# Test & Results

## Preparing the data

In [32]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

import warnings

# Silence every warning for the rest of the session.
# NOTE: this also hides pandas deprecation notices (FutureWarning), so API
# removals in newer pandas versions will surface as errors without prior notice.
warnings.filterwarnings('ignore')

In [None]:
# Load 'df_test.csv' into `df`; the cells below transform this frame step by step.
test_csv_path = 'df_test.csv'
df = pd.read_csv(test_csv_path)

# Genres & Languages

In [33]:
# Encode the 'Genres' column with the dummy columns saved in
# 'encoders/genres.csv', then replace the raw column with those dummies.

# Convert the genres column to a list of strings
df['Genres'] = df['Genres'].astype(str)
df['Genres'] = df['Genres'].str.strip('[]').str.replace("'", "").str.split(", ")

# Drop Games, Strategy, Entertainment from the Genres column
excluded_genres = {'Games', 'Strategy', 'Entertainment'}
df['Genres'] = df['Genres'].apply(
    lambda genres: [genre for genre in genres if genre not in excluded_genres]
)

# Load saved genres dummy variables (only the column names are used here)
saved_dummies = pd.read_csv('encoders/genres.csv')
known_columns = set(saved_dummies.columns)

# Get the genres that are not in the saved dummy variables.
# get_dummies below names its columns 'genre_<name>' and the reindex keeps only
# the columns listed in the saved file, so the lookup must use the prefixed
# name. The bare name is accepted as well, so a file that lists unprefixed
# names is treated exactly as before.
# NOTE(review): assumes the saved columns carry the 'genre_' prefix - confirm
# against the notebook that wrote 'encoders/genres.csv'.
other = [
    genre
    for genre in df['Genres'].explode().dropna().unique()
    if genre not in known_columns and 'genre_' + genre not in known_columns
]

# Replace the genres that are not in the saved dummy variables with 'infrequent'
unseen_genres = set(other)
df['Genres'] = df['Genres'].apply(
    lambda genres: ['infrequent' if genre in unseen_genres else genre for genre in genres]
)

# Preprocess test data using the saved dummy variables.
# explode() yields one row per (original row, genre); groupby(level=0).sum()
# folds those back into one row per original index. This replaces
# `.sum(level=0)`, whose `level` argument was removed in pandas 2.0.
# Rows whose list is empty explode to NaN and therefore become all-zero rows.
test_dummies = (
    pd.get_dummies(df['Genres'].explode(), prefix="genre", dummy_na=False, dtype=int)
    .groupby(level=0)
    .sum()
)
test_dummies = test_dummies.reindex(columns=saved_dummies.columns, fill_value=0)

# Fill the dummy columns with 0 if nan
test_dummies = test_dummies.fillna(0)

# Concatenate dummies to original DataFrame
df = pd.concat([df, test_dummies], axis=1)

# Fill NaN with 0 in every column with prefix 'genre' (covers rows that the
# concat could not align with a dummy row)
genre_cols = [col for col in df.columns if col.startswith('genre')]
df[genre_cols] = df[genre_cols].fillna(0)

# Drop the original Genres column
df = df.drop('Genres', axis=1)

In [None]:
# Encode the 'Languages' column with the dummy columns saved in
# 'encoders/langs.csv' and append those dummies to `df`.

# Convert the langs column to a list of strings
df['Languages'] = df['Languages'].astype(str)
df['Languages'] = df['Languages'].str.strip('[]').str.replace("'", "").str.split(", ")

# Drop the English language from the Languages column (it is the most common language and would dominate the model)
excluded_langs = {'EN'}
df['Languages'] = df['Languages'].apply(
    lambda langs: [lang for lang in langs if lang not in excluded_langs]
)

# Load saved languages dummy variables (only the column names are used here)
saved_dummies = pd.read_csv('encoders/langs.csv')
known_columns = set(saved_dummies.columns)

# Get the languages that are not in the saved dummy variables.
# get_dummies below names its columns 'lang_<code>' and the reindex keeps only
# the columns listed in the saved file, so the lookup must use the prefixed
# name. The bare code is accepted as well, so a file that lists unprefixed
# names is treated exactly as before.
# NOTE(review): assumes the saved columns carry the 'lang_' prefix - confirm
# against the notebook that wrote 'encoders/langs.csv'.
other = [
    lang
    for lang in df['Languages'].explode().dropna().unique()
    if lang not in known_columns and 'lang_' + lang not in known_columns
]

# Replace the languages that are not in the saved dummy variables with 'infrequent'
unseen_langs = set(other)
df['Languages'] = df['Languages'].apply(
    lambda langs: ['infrequent' if lang in unseen_langs else lang for lang in langs]
)

# Preprocess test data using the saved dummy variables.
# explode() yields one row per (original row, language); groupby(level=0).sum()
# folds those back into one row per original index. This replaces
# `.sum(level=0)`, whose `level` argument was removed in pandas 2.0.
# Rows left with an empty list (e.g. English only) explode to NaN and
# therefore become all-zero rows.
test_dummies = (
    pd.get_dummies(df['Languages'].explode(), prefix="lang", dummy_na=False, dtype=int)
    .groupby(level=0)
    .sum()
)
test_dummies = test_dummies.reindex(columns=saved_dummies.columns, fill_value=0)

# Fill the dummy columns with 0 if nan
test_dummies = test_dummies.fillna(0)

# Concatenate dummies to original DataFrame
df = pd.concat([df, test_dummies], axis=1)

# Fill NaN with 0 in every column with prefix 'lang'. This has to run after the
# concat: before it, `df` has no 'lang*' columns and the fill would do nothing.
lang_cols = [col for col in df.columns if col.startswith('lang')]
df[lang_cols] = df[lang_cols].fillna(0)
