# Data Preprocessing

In [2]:
# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings('ignore')

## Load the dataset

In [None]:
df = pd.read_csv('games-regression-dataset.csv')

## Setting data types

In [None]:
# drop Primary Genre
df.drop(['Primary Genre', 'ID', 'URL', 'Icon URL'], axis=1, inplace=True)

df['Original Release Date'] = pd.to_datetime(df['Original Release Date'], format='%d/%m/%Y')
df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'], format='%d/%m/%Y')


## Data Exploration

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df['Genres'] = df['Genres'].astype(str)
df['Genres'] = df['Genres'].str.strip('[]').str.replace("'", "").str.split(", ")

genre_counts = df.explode('Genres').groupby('Genres').size().sort_values(ascending=False)
genre_counts


In [None]:
df['Developer'].value_counts()

In [None]:
df['Developer'].unique().size

In [None]:
df['Languages'] = df['Languages'].astype(str)

df['Languages'] = df['Languages'].str.strip('[]').str.replace("'", "").str.split(", ")

langs_counts = df.explode('Languages').groupby('Languages').size().sort_values(ascending=False)
langs_counts

## Developer preprocessing

In [None]:
# Convert to string
df['Developer'] = df['Developer'].astype(str)
df['Developer'] = df['Developer'].str.replace("'", "").str.strip('[]')

dev_counts = df['Developer'].value_counts()
other = dev_counts[dev_counts < 5].index
df['Developer'] = df['Developer'].replace(other, 'Other')

dev_df = df[['Developer', 'Average User Rating']].groupby('Developer').mean()
dev_df['Count'] = df['Developer'].value_counts()

dev_df = dev_df.sort_values(by='Count', ascending=False)
dev_df

In [None]:
# Replace the developer names with the average user rating from dev_df
df['Developer'] = df['Developer'].replace(dev_df.index, dev_df['Average User Rating'])
df.head()

## Genres preprocessing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

# Convert the genres column to a list of strings
df['Genres'] = df['Genres'].astype(str)
df['Genres'] = df['Genres'].str.strip('[]').str.replace("'", "").str.split(", ")

# drop Games, Strategy, Entertainment from the Genres column
df['Genres'] = df['Genres'].apply(lambda x: [genre for genre in x if genre not in ['Games', 'Strategy', 'Entertainment']])

# Join the list of genres into a single string
genres = df['Genres'].apply(lambda x: ' '.join(x))

# Create a count Vectorizer and fit it to the genres
count_vec = CountVectorizer()
bow_genres = count_vec.fit_transform(genres)

# Apply principal component analysis to reduce the dimensionality
pca = PCA(n_components=10)
pca_genres = pca.fit_transform(bow_genres.toarray())

# Add the PCA-transformed genres to the original dataframe
for i in range(10):
    df[f'Genre_PCA_{i}'] = pca_genres[:, i]

# Drop the original column
df = df.drop(['Genres'], axis=1)

## Languages preprocessing

In [None]:
# Convert the genres column to a list of strings
df['Languages'] = df['Languages'].astype(str)
df['Languages'] = df['Languages'].str.strip('[]').str.replace("'", "").str.split(", ")

# drop Games, Strategy, Entertainment from the Genres column
df['Languages'] = df['Languages'].apply(lambda x: [lang for lang in x if lang not in ['En']])

# Join the list of genres into a single string
languages = df['Languages'].apply(lambda x: ' '.join(x))

# Create a count Vectorizer and fit it to the genres
count_vec = CountVectorizer()
bow_languages = count_vec.fit_transform(languages)

# Apply principal component analysis to reduce the dimensionality
pca = PCA(n_components=10)
pca_languages = pca.fit_transform(bow_languages.toarray())

# Add the PCA-transformed genres to the original dataframe
for i in range(10):
    df[f'Languages_PCA_{i}'] = pca_languages[:, i]

# Drop the original column
df = df.drop(['Languages'], axis=1)

## In-app Purchases preprocessing

In [None]:
# Free apps might skew the in-app purchases column,
# so we might split the dataset into free and paid apps

df['In-app Purchases'] = df['In-app Purchases'].astype(str)
df['In-app Purchases'] = df['In-app Purchases'].str.strip('[]').str.replace("'", "").str.split(", ")

In [None]:
# Convert to float
df['In-app Purchases'] = df['In-app Purchases'].apply(lambda x: [float(i) for i in x])

# Get the lowest, highest and average purchase
df['Lowest Purchase'] = df['In-app Purchases'].apply(lambda x: min(x) if len(x) > 0 else 0)
df['Highest Purchase'] = df['In-app Purchases'].apply(lambda x: max(x) if len(x) > 0 else 0)
df['Average Purchase'] = df['In-app Purchases'].apply(lambda x: np.mean(x) if len(x) > 0 else 0)


In [None]:
# Drop the original column
df = df.drop(['In-app Purchases'], axis=1)

df['Lowest Purchase'] = df['Lowest Purchase'].fillna(0)
df['Highest Purchase'] = df['Highest Purchase'].fillna(0)
df['Average Purchase'] = df['Average Purchase'].fillna(0)
df.head()

## Age Rating preprocessing

In [None]:
# Convert to string
df['Age Rating'] = df['Age Rating'].astype(str)

# Remove the + sign
df['Age Rating'] = df['Age Rating'].str.replace('+', '')

# Convert to int
df['Age Rating'] = df['Age Rating'].astype(float)

## Dates preprocessing

In [None]:
# Convert the datetime to ordinal
df['Original Release Date'] = df['Original Release Date'].apply(lambda x: x.toordinal())
df['Current Version Release Date'] = df['Current Version Release Date'].apply(lambda x: x.toordinal())

df.head()


## NLP preprocessing

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from textblob import TextBlob
import re

def preprocess_nlp(col):
    df[col] = df[col].astype(str)

    # Remove URLs and email addresses
    df[col] = df[col].apply(lambda x: re.sub(r'http\S+|www.\S+|\S+@\S+', '', x))

    # Remove the punctuation, numbers, and convert to lowercase
    df[col] = df[col].apply(lambda x: " ".join(re.findall(r'\w+', x.lower())))

    # Remove the stopwords
    stop = stopwords.words('english')
    df[col] = df[col].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

    # Stemming
    st = nltk.PorterStemmer()
    df[col] = df[col].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

    # Lemmatization
    lem = nltk.WordNetLemmatizer()
    df[col] = df[col].apply(lambda x: " ".join([lem.lemmatize(word) for word in x.split()]))

    # Remove the frequent and rare words
    freq = pd.Series(' '.join(df[col]).split()).value_counts()
    common_freq = list(freq[:10].index)
    rare_freq = list(freq[-10:].index)
    df[col] = df[col].apply(lambda x: " ".join(x for x in x.split() if x not in common_freq+rare_freq))

    # Remove the whitespaces
    df[col] = df[col].apply(lambda x: " ".join(x.strip() for x in x.split()))

    # Replace NaN values with empty string
    df[col] = df[col].fillna('')

    # Convert text data to bag-of-words representation
    vectorizer = CountVectorizer()
    BoW = vectorizer.fit_transform(df[col])

    # Apply principal component analysis to reduce the dimensionality
    pca_ = PCA(n_components=10)
    pca_col = pca_.fit_transform(BoW.toarray())

    # Add the PCA-transformed genres to the original dataframe
    for feat in range(10):
        df[f'{col}_PCA_{feat}'] = pca_col[:, feat]



In [None]:
preprocess_nlp('Description')
preprocess_nlp('Subtitle')
preprocess_nlp('Name')

df = df.drop(['Description', 'Subtitle', 'Name'], axis=1)

df.head()


## Icon preprocessing

### Download the icons

In [None]:
# Convert to string
df['Icon URL'] = df['Icon URL'].astype(str)

import requests
import os
import shutil


def download_image(url, filename):
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(filename, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)


# Create a folder to store the images
if not os.path.exists('icons'):
    os.makedirs('icons')

# Download the images
for i, row in df.iterrows():
    download_image(row['Icon URL'], f'icons/{i}.png')



### Extract features from the icons

In [None]:
import cv2
import numpy as np

def preprocess_icon(img_path):
    # Load the game icon image
    img = cv2.imread(img_path)
    img = cv2.resize(img, (100, 100))

    # Extract color features using color histograms
    colors = ('b', 'g', 'r')
    color_features = []
    for k, col in enumerate(colors):
        hist = cv2.calcHist([img], [k], None, [256], [0, 256])
        color_features.append(hist)

    # Reshape the color features to have a single dimension
    color_features = np.concatenate(color_features).ravel()

    # Extract shape features using edge detection
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 100, 200)
    edge_features = np.array(edges).flatten()

    # Combine the color and shape features into a single feature vector
    feature_vector = np.concatenate((color_features, edge_features))

    # Normalize the feature vector to have unit length
    normalized_feature_vector = feature_vector / np.linalg.norm(feature_vector)

    return normalized_feature_vector

# Create a list to store the feature vectors
icon_features = []

# Iterate over the images and extract the features
for i, row in df.iterrows():
    icon_features.append(preprocess_icon(f'icons/{i}.png'))

# Apply PCA to reduce the number of features
pca = PCA(n_components=10)
icon_features_pca = pca.fit_transform(icon_features)

icon_features_df = pd.DataFrame(icon_features_pca, columns=[f'icon_{i}' for i in range(icon_features_pca.shape[1])])
icon_features_df.head()

### Add the icon features to the dataset

In [None]:
# Concatenate the icon features with the other features
df = pd.concat([df, icon_features_df], axis=1)

# Save the updated dataset
df.to_csv('games_with_icon_features.csv', index=False)

In [None]:
df.head()

## Run the model

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('games_with_icon_features.csv')

# Split the dataset into training and testing sets
y = df['Average User Rating']
features = df.drop(['Average User Rating'], axis=1)

# Feature selection
from sklearn.feature_selection import SelectKBest, f_regression
selector = SelectKBest(f_regression, k=10)
X = selector.fit_transform(features, y)

# print the selected features
print(selector.get_support(indices=True))
seleted_features = features.columns[selector.get_support(indices=True)]
print(seleted_features)

# Scale the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


[ 2  5  6 10 28 29 36 41 52 56]
Index(['Developer', 'Original Release Date', 'Current Version Release Date',
       'Genre_PCA_3', 'Highest Purchase', 'Average Purchase',
       'Description_PCA_6', 'Subtitle_PCA_1', 'Name_PCA_2', 'Name_PCA_6'],
      dtype='object')


In [5]:
# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Save the model
import pickle
pickle.dump(model, open('models/LR_model.pkl', 'wb'))

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

# print the features weights
for i in range(len(model.coef_)):
    print(f'Feature {selected_features[i]}: {model.coef_[i]}')


Mean squared error: 0.49
Coefficient of determination: 0.15


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
# Create a ridge regression model
model = Ridge(alpha=0.5)

# Train the model
model.fit(X_train, y_train)

# Save the model
pickle.dump(model, open('models/Ridge_model.pkl', 'wb'))

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))


In [None]:
# Create a lasso regression model
model = Lasso(alpha=0.5)

# Train the model
model.fit(X_train, y_train)

# Save the model
pickle.dump(model, open('models/Lasso_model.pkl', 'wb'))

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))


In [None]:
# Create an elastic net regression model
model = ElasticNet(alpha=0.5)

# Train the model
model.fit(X_train, y_train)

# Save the model
pickle.dump(model, open('models/ElasticNet_model.pkl', 'wb'))

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))


In [None]:
# Create a polynomial regression model
poly = PolynomialFeatures(degree=3)

X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.fit_transform(X_test)

# Train the model
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Save the model
pickle.dump(model, open('models/Polynomial_model.pkl', 'wb'))

# Make predictions
y_pred = model.predict(X_test_poly)

# Evaluate the model
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))
