# Test & Results

## Preparing the data

In [32]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

import warnings

# Silence every warning for the rest of the session to keep the cell output readable.
# NOTE(review): this also hides library deprecation warnings.
warnings.filterwarnings(action='ignore')

## Load the data

In [None]:
from datetime import datetime
# Both release-date columns are stored as day/month/year strings.
DATE_FORMAT = '%d/%m/%Y'
DATE_COLUMNS = ['Original Release Date', 'Current Version Release Date']


def dateparse(x):
    """Parse a single day/month/year string into a ``datetime``.

    No longer used for loading; kept so any cell that still calls it keeps working.
    """
    return datetime.strptime(x, DATE_FORMAT)


# Parse the dates after loading instead of through read_csv's `date_parser`
# argument: that argument is deprecated in recent pandas releases, and
# strptime only accepts well-formed strings, so it cannot cope with missing
# dates.  pd.to_datetime with an explicit format is vectorised and turns
# missing values into NaT.
df_origin = pd.read_csv('df_test.csv')
for date_col in DATE_COLUMNS:
    df_origin[date_col] = pd.to_datetime(df_origin[date_col], format=DATE_FORMAT)

# Reference data; later cells take medians from it to impute missing values.
df_old = pd.read_csv('preprocessed_data.csv')

In [None]:
# Drop the Primary Genre, ID and URL columns
df_origin = df_origin.drop(columns=['Primary Genre', 'ID', 'URL'])

## Download the icons

In [None]:
# Make sure every icon URL is a plain string
df_origin = df_origin.astype({'Icon URL': str})

import requests
import os
import shutil


def download_image(url, filename, timeout=30):
    """Download ``url`` and store the response body in ``filename``.

    Args:
        url: Address of the image to fetch.
        filename: Path of the file the downloaded bytes are written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the server answered with status 200 and the file was
        written; False for any other status (no file is written then).

    Network errors and timeouts raised by ``requests`` are not caught here
    and propagate to the caller.
    """
    # Without a timeout a single unresponsive server would block forever;
    # the context manager releases the streamed connection when we are done.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            return False
        # Have the raw stream undo any content encoding (e.g. gzip) while copying.
        r.raw.decode_content = True
        with open(filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return True


# Create a folder to store the images; exist_ok avoids the separate
# "does it exist?" check and the race that comes with it.
os.makedirs('icons_test', exist_ok=True)

# Download the images.  Only the URL column is needed, so iterate over it
# directly instead of materialising every row with iterrows().
# Each file is named after the row's index label.
for i, icon_url in df_origin['Icon URL'].items():
    download_image(icon_url, f'icons_test/{i}.png')

In [None]:
# Point every row at its downloaded icon file, which is named after the row's index label
df_origin['Icon URL'] = [f'icons_test/{idx}.png' for idx in df_origin.index]

## Split the data

In [None]:
# Separate the target column from the feature columns
target_col = 'Average User Rating'
df_y = df_origin[target_col]
df = df_origin.drop(columns=[target_col])

## Dates, Developer, Age Rating, Purchases and Price

In [None]:
### Dates

date_cols = ['Original Release Date', 'Current Version Release Date']

for date_col in date_cols:
    # Convert the datetime to ordinal.  A missing date (NaT) has no ordinal
    # and NaT.toordinal() raises, so map it to NaN here; otherwise the
    # imputation on the next line could never be reached for missing dates.
    df[date_col] = df[date_col].apply(lambda x: x.toordinal() if pd.notna(x) else np.nan)

    # Fill missing dates with the median taken from the reference data
    # NOTE(review): assumes df_old stores this column as ordinals too — confirm.
    df[date_col] = df[date_col].fillna(df_old[date_col].median())

# Create a new column with the age of the game:
# days between the original release and the current version's release
df['game_age'] = df['Current Version Release Date'] - df['Original Release Date']

# Create a new column with the time since the last update (days, relative to today)
df['last_update'] = datetime.now().toordinal() - df['Current Version Release Date']

In [None]:
### Developer

dev_df = pd.read_csv('dev_df.csv')

# Replace the developer names with the average user rating from dev_df.
# read_csv gives dev_df a plain 0..n-1 index, so replace(dev_df.index, ...)
# keyed on row numbers rather than developer names and left the column as
# strings.  Build an explicit name -> rating lookup instead.
# NOTE(review): assumes the names are in a 'Developer' column (falling back
# to the first column) — confirm against the notebook that wrote dev_df.csv.
dev_key = 'Developer' if 'Developer' in dev_df.columns else dev_df.columns[0]
dev_ratings = dev_df.groupby(dev_key)['Average User Rating'].mean()
df['Developer'] = df['Developer'].map(dev_ratings)

# Developers missing from the lookup get the mean rating so the column stays numeric.
# NOTE(review): confirm this matches how unseen developers were handled in training.
df['Developer'] = df['Developer'].fillna(dev_ratings.mean())

In [None]:
### Age Rating

# Convert to string and remove the + sign (e.g. '12+' -> '12').
# regex=False makes the literal match explicit: '+' is a regex quantifier
# and the default of `regex` differs between pandas versions.
age_rating = df['Age Rating'].astype(str).str.replace('+', '', regex=False)

# Convert to float (not int): missing values became the text 'nan' above
# and turn back into NaN here.
age_rating = age_rating.astype(float)

# Fill the missing values with the median of the reference data
df['Age Rating'] = age_rating.fillna(df_old['Age Rating'].median())

In [None]:
### Purchases

# Free apps might skew the in-app purchases column,
# so we might split the dataset into free and paid apps


def _parse_purchases(raw):
    """Turn one raw 'In-app Purchases' cell into a list of float prices.

    A missing cell yields an empty list, and blank entries are skipped.
    Previously a missing cell was stringified to 'nan' and parsed as a
    single NaN price, so apps without purchases were counted as having one.
    """
    if pd.isna(raw):
        return []
    entries = str(raw).strip('[]').replace("'", "").split(", ")
    return [float(entry) for entry in entries if entry.strip()]


# NOTE(review): rows without purchases now get purchases_count 0 instead of 1;
# make sure the training-time preprocessing agrees.
purchases = df['In-app Purchases'].apply(_parse_purchases)

# Get the number of in-app purchases
df['purchases_count'] = purchases.apply(len)

# Get the lowest, highest and average purchase (0 when there are none)
df['lowest_purchase'] = purchases.apply(lambda prices: min(prices) if prices else 0)
df['highest_purchase'] = purchases.apply(lambda prices: max(prices) if prices else 0)
df['average_purchase'] = purchases.apply(lambda prices: np.mean(prices) if prices else 0)

# Drop the original column
df = df.drop(['In-app Purchases'], axis=1)

# A NaN price inside a list propagates into the statistics; replace it with 0
for stat_col in ['lowest_purchase', 'highest_purchase', 'average_purchase']:
    df[stat_col] = df[stat_col].fillna(0)

In [None]:
### Price
# Cast the prices to float; a missing price becomes 0
df['Price'] = df['Price'].astype(float).fillna(0)

## Genres & Languages

In [33]:
# Convert the genres column to a list of strings
df['Genres'] = (
    df['Genres'].astype(str)
    .str.strip('[]')
    .str.replace("'", "", regex=False)
    .str.split(", ")
)

# Drop Games, Strategy, Entertainment from the Genres column
excluded_genres = {'Games', 'Strategy', 'Entertainment'}
df['Genres'] = df['Genres'].apply(lambda genres: [g for g in genres if g not in excluded_genres])

# Load saved genres dummy variables
saved_dummies = pd.read_csv('encoders/genres.csv')
saved_columns = set(saved_dummies.columns)

# Get the genres that are not in the saved dummy variables.
# The dummies built below are named 'genre_<name>' and are reindexed against
# the saved columns, so the membership test has to use the prefixed name;
# comparing the raw name could never agree with that reindex.
# NOTE(review): assumes encoders/genres.csv uses the same 'genre_' prefix — confirm.
other = {
    genre for genre in df['Genres'].explode().dropna().unique()
    if f'genre_{genre}' not in saved_columns
}

# Replace the genres that are not in the saved dummy variables with 'infrequent'
df['Genres'] = df['Genres'].apply(lambda genres: ['infrequent' if g in other else g for g in genres])

# Preprocess test data using the saved dummy variables.
# explode() yields one row per (app, genre); rows with an empty list become
# NaN and therefore all-zero dummies.  groupby(level=0).sum() collapses the
# rows back to one per app (the `level` argument of sum() is gone in pandas 2).
test_dummies = (
    pd.get_dummies(df['Genres'].explode(), prefix="genre", dummy_na=False)
    .groupby(level=0)
    .sum()
)
test_dummies = test_dummies.reindex(columns=saved_dummies.columns, fill_value=0)

# Concatenate dummies to original DataFrame
df = pd.concat([df, test_dummies], axis=1)

# Safety net: fill any NaN left in the dummy columns with 0
genre_cols = [col for col in df.columns if col.startswith('genre')]  # all columns with prefix 'genre'
df[genre_cols] = df[genre_cols].fillna(0)

# Drop the original Genres column
df = df.drop('Genres', axis=1)

In [None]:
# Convert the langs column to a list of strings
df['Languages'] = (
    df['Languages'].astype(str)
    .str.strip('[]')
    .str.replace("'", "", regex=False)
    .str.split(", ")
)

# Drop the English language from the Languages column (it is the most common language and would dominate the model)
df['Languages'] = df['Languages'].apply(lambda langs: [lang for lang in langs if lang != 'EN'])

# Load saved languages dummy variables
saved_dummies = pd.read_csv('encoders/langs.csv')
saved_columns = set(saved_dummies.columns)

# Get the languages that are not in the saved dummy variables.
# The dummies built below are named 'lang_<code>' and are reindexed against
# the saved columns, so the membership test has to use the prefixed name;
# comparing the raw code could never agree with that reindex.
# NOTE(review): assumes encoders/langs.csv uses the same 'lang_' prefix — confirm.
other = {
    lang for lang in df['Languages'].explode().dropna().unique()
    if f'lang_{lang}' not in saved_columns
}

# Replace the languages that are not in the saved dummy variables with 'infrequent'
df['Languages'] = df['Languages'].apply(lambda langs: ['infrequent' if lang in other else lang for lang in langs])

# Preprocess test data using the saved dummy variables.
# explode() yields one row per (app, language); rows with an empty list become
# NaN and therefore all-zero dummies.  groupby(level=0).sum() collapses the
# rows back to one per app (the `level` argument of sum() is gone in pandas 2).
test_dummies = (
    pd.get_dummies(df['Languages'].explode(), prefix="lang", dummy_na=False)
    .groupby(level=0)
    .sum()
)
test_dummies = test_dummies.reindex(columns=saved_dummies.columns, fill_value=0)

# Concatenate dummies to original DataFrame
df = pd.concat([df, test_dummies], axis=1)

# Fill NaN with 0 — this has to run after the concat, otherwise there are no 'lang' columns to fill yet
lang_cols = [col for col in df.columns if col.startswith('lang')]  # all columns with prefix 'lang'
df[lang_cols] = df[lang_cols].fillna(0)

# Drop the original Languages column (as is done for Genres); it holds
# Python lists, which the scaler applied later cannot convert to numbers.
df = df.drop('Languages', axis=1)


In [None]:
# NOTE(review): `model`, `X_test`, `y_test` and `selected_features` are not
# defined anywhere in the visible notebook, and the metric functions are only
# imported in a later cell — confirm whether this cell is still needed.

# Predict on the held-out features
y_pred = model.predict(X_test)

# Report the error and the goodness of fit
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

# Print the coefficient learned for each selected feature
for i, weight in enumerate(model.coef_):
    print(f'Feature {selected_features[i]}: {weight}')

## NLP preprocessing

In [None]:
import nltk
# Fetch the NLTK data packages used below (stop-word list and WordNet data)
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from textblob import TextBlob
import re

def preprocess_nlp(col):
    """Clean the text column ``col`` of the global ``df`` and append its PCA features.

    The column is normalised in place: URLs and e-mail addresses are removed,
    the text is lower-cased and split into word tokens, stop words are
    dropped, every token is stemmed and lemmatized, and the 10 most and 10
    least frequent tokens of the column are removed.  The cleaned text is
    then passed through the pickled vectorizer and PCA stored under
    ``encoders/`` for this column, and each resulting component is added to
    ``df`` as ``{col}_PCA_{i}``.

    Args:
        col: Name of the text column in ``df`` to process.

    Returns:
        None.  ``df`` is modified in place.
    """
    # A set gives O(1) membership tests; the stop words are checked once per token.
    stop_words = set(stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()

    def _tokenize(text):
        # Remove URLs and email addresses.
        # NOTE(review): the '.' in 'www.' is unescaped and matches any
        # character; left as is so the cleaning rules are unchanged.
        text = re.sub(r'http\S+|www.\S+|\S+@\S+', '', text)

        # Lowercase and keep runs of word characters (letters, digits,
        # underscore); this drops punctuation.
        tokens = re.findall(r'\w+', text.lower())

        # Remove the stopwords, then stem and lemmatize each remaining token
        tokens = [
            lemmatizer.lemmatize(stemmer.stem(tok))
            for tok in tokens
            if tok not in stop_words
        ]
        return [tok for tok in tokens if tok]

    # astype(str) turns a missing value into the literal text 'nan', so no
    # NaN can remain in the column after this point.
    token_lists = df[col].astype(str).apply(_tokenize)

    # Remove the frequent and rare words.  The drop set is built once instead
    # of concatenating the two lists again for every single token.
    freq = pd.Series([tok for toks in token_lists for tok in toks]).value_counts()
    dropped = set(freq[:10].index) | set(freq[-10:].index)
    df[col] = token_lists.apply(
        lambda toks: " ".join(tok for tok in toks if tok not in dropped)
    )

    # Convert text data to bag-of-words representation.
    # NOTE: unpickling runs arbitrary code — only load encoder files you trust.
    with open(f'encoders/vectorizer_{col}.pkl', 'rb') as fh:
        vectorizer = pickle.load(fh)
    BoW = vectorizer.transform(df[col])

    # Apply principal component analysis to reduce the dimensionality
    with open(f'encoders/pca_{col}.pkl', 'rb') as fh:
        pca_ = pickle.load(fh)
    pca_col = pca_.transform(BoW.toarray())

    # Add the PCA-transformed col to the original dataframe, one column per component
    for feat in range(pca_col.shape[1]):
        df[f'{col}_PCA_{feat}'] = pca_col[:, feat]



In [None]:
# Build the text features for each free-text column, then discard the raw text
text_columns = ['Description', 'Subtitle', 'Name']
for text_col in text_columns:
    preprocess_nlp(text_col)

df = df.drop(columns=text_columns)

## Icon

In [None]:
import cv2

def preprocess_icon(img_path):
    """Build a unit-length feature vector (colour histograms + edge map) for an icon.

    Args:
        img_path: Path of the image file to read.

    Returns:
        A 1-D numpy array holding the three 256-bin channel histograms
        followed by the flattened Canny edge map of the 100x100 resized
        image, divided by its Euclidean norm.

    Raises:
        FileNotFoundError: If the image cannot be read from ``img_path``.
    """
    # Load the game icon image.  cv2.imread reports failure by returning None
    # rather than raising; fail here with the offending path instead of an
    # obscure error from the resize below.
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f'Could not read icon image: {img_path}')
    img = cv2.resize(img, (100, 100))

    # Extract color features using one 256-bin histogram per channel
    # (channels 0, 1, 2 are blue, green, red in OpenCV's BGR layout)
    channel_hists = [
        cv2.calcHist([img], [channel], None, [256], [0, 256])
        for channel in range(3)
    ]

    # Reshape the color features to have a single dimension
    color_features = np.concatenate(channel_hists).ravel()

    # Extract shape features using edge detection
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 100, 200)
    edge_features = edges.flatten()

    # Combine the color and shape features into a single feature vector
    feature_vector = np.concatenate((color_features, edge_features))

    # Normalize the feature vector to have unit length
    return feature_vector / np.linalg.norm(feature_vector)


In [None]:
from tqdm import tqdm

df['Icon URL'] = df['Icon URL'].astype(str)

# Extract one (path, feature vector) pair per icon, in row order.  Only the
# path column is needed, so iterate over it instead of using iterrows().
icon_features = [
    (icon_path, preprocess_icon(icon_path))
    for icon_path in tqdm(df['Icon URL'], total=df.shape[0])
]

# Apply PCA to reduce the number of features.
# NOTE: unpickling runs arbitrary code — only load encoder files you trust.
with open('encoders/icon_pca.pkl', 'rb') as fh:
    pca = pickle.load(fh)

reduced_features = pca.transform([vec for _, vec in icon_features])

# Convert the reduced features to a dataframe (first four components)
icon_features_df = pd.DataFrame({'Icon URL': [path for path, _ in icon_features],
                                 'Icon1': reduced_features[:, 0],
                                 'Icon2': reduced_features[:, 1],
                                 'Icon3': reduced_features[:, 2],
                                 'Icon4': reduced_features[:, 3]})

# Merge the icon features with the original dataframe on the icon path
df = df.merge(icon_features_df, on='Icon URL', how='left')

# Drop the icon path column
df = df.drop(['Icon URL'], axis=1)

## Scaling and Feature Selection

In [None]:
cols = df.columns

# Load the saved scaler; the with-block closes the file handle.
# NOTE: unpickling runs arbitrary code — only load scaler files you trust.
with open('scalers/std_scaler.pkl', 'rb') as fh:
    scaler = pickle.load(fh)

# Wrap the scaled values in a DataFrame again to keep the column names
df = pd.DataFrame(scaler.transform(df), columns=cols)

In [None]:
# Load the saved feature selector; the with-block closes the file handle.
# NOTE: unpickling runs arbitrary code — only load encoder files you trust.
with open('encoders/selector.pkl', 'rb') as fh:
    selector = pickle.load(fh)

# Apply the selector to the scaled features
df = selector.transform(df)

In [None]:
### Models
from sklearn.metrics import mean_squared_error, r2_score

def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    NOTE: unpickling runs arbitrary code — only load files you trust.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _print_metrics(title, y_true, y_pred):
    """Print the mean squared error and coefficient of determination under ``title``."""
    print(title)
    print('Mean squared error: %.2f' % mean_squared_error(y_true, y_pred))
    print('Coefficient of determination: %.2f' % r2_score(y_true, y_pred))
    print('------------------------------------------------------------')


#### Linear Regression
lr_model = _load_pickle('models/LR_model.pkl')
lr_pred = lr_model.predict(df)
_print_metrics('Linear Regression', df_y, lr_pred)

#### Ridge Regression
ridge_model = _load_pickle('models/Ridge_model.pkl')
ridge_pred = ridge_model.predict(df)
_print_metrics('Ridge Regression', df_y, ridge_pred)

#### Lasso Regression
lasso_model = _load_pickle('models/Lasso_model.pkl')
lasso_pred = lasso_model.predict(df)
_print_metrics('Lasso Regression', df_y, lasso_pred)

#### ElasticNet Regression
elastic_model = _load_pickle('models/ElasticNet_model.pkl')
elastic_pred = elastic_model.predict(df)
_print_metrics('ElasticNet Regression', df_y, elastic_pred)

#### Polynomial Regression
poly_model = _load_pickle('models/Polynomial_model.pkl')
poly_features = _load_pickle('encoders/poly.pkl')

# The polynomial model expects the features transformed by the saved encoder
df_poly = poly_features.transform(df)
poly_pred = poly_model.predict(df_poly)
_print_metrics('Polynomial Regression', df_y, poly_pred)