# Data Preprocessing

## Importing the libraries

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA

import warnings

warnings.filterwarnings('ignore')

## Load the dataset

In [33]:
df_origin = pd.read_csv('games-regression-dataset.csv')

## Data Splitting

In [34]:
from sklearn.model_selection import train_test_split

df, df_test = train_test_split(df_origin, test_size=0.2, random_state=42)
df_test.to_csv('df_test.csv', index=False)

## Setting data types

In [35]:
# drop Primary Genre
df.drop(['Primary Genre', 'ID', 'URL'], axis=1, inplace=True)

df['Original Release Date'] = pd.to_datetime(df['Original Release Date'], errors='coerce', format='%d/%m/%Y')
df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'], errors='coerce', format='%d/%m/%Y')


## Data Exploration

### General information

In [None]:
df.head()

In [None]:
df.info()

In [16]:
df.isnull().sum()

Name                               0
Subtitle                        3027
Icon URL                           0
User Rating Count                  0
Price                              0
In-app Purchases                1608
Description                        0
Developer                          0
Age Rating                         0
Languages                          6
Size                               0
Genres                             0
Original Release Date              0
Current Version Release Date       0
Average User Rating                0
dtype: int64

### Genres

In [None]:
df['Genres'] = df['Genres'].astype(str)
df['Genres'] = df['Genres'].str.strip('[]').str.replace("'", "").str.split(", ")

genre_counts = df.explode('Genres').groupby('Genres').size().sort_values(ascending=False)
genre_counts


### Developer

In [None]:
df['Developer'].value_counts()

In [None]:
df['Developer'].unique().size

### Languages

In [None]:
df['Languages'] = df['Languages'].astype(str)

df['Languages'] = df['Languages'].str.strip('[]').str.replace("'", "").str.split(", ")

langs_counts = df.explode('Languages').groupby('Languages').size().sort_values(ascending=False)
print(langs_counts[1:30])

In [None]:
df.hist(figsize=(15, 15))

## Developer preprocessing

In [25]:
# Convert to string
df['Developer'] = df['Developer'].astype(str)
df['Developer'] = df['Developer'].str.replace("'", "").str.strip('[]')

# Replace the developer names with less than 3 games with 'Other'
dev_counts = df['Developer'].value_counts()
other = dev_counts[dev_counts < 3].index
df['Developer'] = df['Developer'].replace(other, 'Other')

dev_df = df[['Developer', 'Average User Rating']].groupby('Developer').mean()

# Save dev_df to be used on the test set
dev_df.to_csv('encoders/dev_df.csv')

# Replace the developer names with the average user rating from dev_df
df['Developer'] = df['Developer'].replace(dev_df.index, dev_df['Average User Rating'])

## Genres preprocessing

### 1. NLP approach

In [None]:
# Convert the genres column to a list of strings
df['Genres'] = df['Genres'].astype(str)
df['Genres'] = df['Genres'].str.strip('[]').str.replace("'", "").str.split(", ")

# drop Games, Strategy, Entertainment from the Genres column
df['Genres'] = df['Genres'].apply(lambda x: [genre for genre in x if genre not in ['Games', 'Strategy', 'Entertainment']])

# Join the list of genres into a single string
genres = df['Genres'].apply(lambda x: ' '.join(x))

# Create a count Vectorizer and fit it to the genres
count_vec = CountVectorizer()
bow_genres = count_vec.fit_transform(genres)

# Apply principal component analysis to reduce the dimensionality
pca = PCA(n_components=10)
pca_genres = pca.fit_transform(bow_genres.toarray())

# Add the PCA-transformed genres to the original dataframe
for i in range(len(pca_genres[0])):
    df[f'Genre_PCA_{i}'] = pca_genres[:, i]

# Drop the original column
df = df.drop(['Genres'], axis=1)

### 2. Dummy variables approach

In [None]:
# Convert the genres column to a list of strings
df['Genres'] = df['Genres'].astype(str)
df['Genres'] = df['Genres'].str.strip('[]').str.replace("'", "").str.split(", ")

# drop Games, Strategy, Entertainment from the Genres column
df['Genres'] = df['Genres'].apply(lambda x: [genre for genre in x if genre not in ['Games', 'Strategy', 'Entertainment']])

# Replace genres with counts less than 100 with 'infrequent' as it would represent a very small percentage of the data (less than 2%)
other = df['Genres'].explode().value_counts()[df['Genres'].explode().value_counts() < 100].index
df['Genres'] = df['Genres'].apply(lambda x: [genre if genre not in other else 'infrequent' for genre in x])

# Replace empty lists with 'infrequent'
df['Genres'] = df['Genres'].apply(lambda x: ['infrequent'] if len(x) == 0 else x)

# Get dummy variables for the genres
genres = pd.get_dummies(df['Genres'].apply(pd.Series).stack()).sum(level=0)

# Add the dummy variables to the original dataframe
df = pd.concat([df, genres], axis=1)

# Drop the original column
df = df.drop(['Genres'], axis=1)

df.iloc[:, 15:].head()


### 3. Multi-label binarizer

In [26]:
# Convert the genres column to a list of strings
df['Genres'] = df['Genres'].astype(str)
df['Genres'] = df['Genres'].str.strip('[]').str.replace("'", "").str.split(", ")

# drop Games, Strategy, Entertainment from the Genres column
df['Genres'] = df['Genres'].apply(lambda x: [genre for genre in x if genre not in ['Games', 'Strategy', 'Entertainment']])

# Replace genres with counts less than 100 with 'infrequent' as it would represent a very small percentage of the data (less than 2%)
other = df['Genres'].explode().value_counts()[df['Genres'].explode().value_counts() < 100].index
df['Genres'] = df['Genres'].apply(lambda x: [genre if genre not in other else 'infrequent_genre' for genre in x])

# Instantiate the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit the MultiLabelBinarizer to the genres
mlb.fit(df['Genres'])

# Drop nan from the classes
mlb.classes_ = np.delete(mlb.classes_, np.where(mlb.classes_ == 'nan'))

# Save the mlb for later use with the test data
pickle.dump(mlb, open('encoders/mlb_genres.pkl', 'wb'))

# Transform the genres into a one-hot encoded array
genres_mlb = mlb.transform(df['Genres'])

# Create a dataframe from the one-hot encoded array
genres_mlb_df = pd.DataFrame(genres_mlb, columns=mlb.classes_)

# Add the one-hot encoded genres to the original dataframe
df = pd.concat([df, genres_mlb_df], axis=1)

# Drop the original column
df = df.drop(['Genres'], axis=1)

print(df.shape)

df.iloc[:, 15:].head()

(4988, 26)


Unnamed: 0,Adventure,Board,Card,Casual,Education,Family,Puzzle,Role Playing,Simulation,Sports,infrequent_genre
2173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
927,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Languages preprocessing

### 1. NLP approach

In [None]:
# Convert the langs column to a list of strings
df['Languages'] = df['Languages'].astype(str)
df['Languages'] = df['Languages'].str.strip('[]').str.replace("'", "").str.split(", ")

# Drop the English language from the Languages column
df['Languages'] = df['Languages'].apply(lambda x: [lang for lang in x if lang not in ['EN']])

# Join the list of langs into a single string
languages = df['Languages'].apply(lambda x: ' '.join(x))

# Create a count Vectorizer and fit it to the langs
count_vec = CountVectorizer()
bow_languages = count_vec.fit_transform(languages)

# Apply principal component analysis to reduce the dimensionality
pca = PCA(n_components=10)
pca_languages = pca.fit_transform(bow_languages.toarray())

# Add the PCA-transformed langs to the original dataframe
for i in range(len(pca_languages[0])):
    df[f'Languages_PCA_{i}'] = pca_languages[:, i]

# Drop the original column
df = df.drop(['Languages'], axis=1)

print(df.shape)

df.iloc[:, 15:].head()

### 2. Dummy variables approach

In [None]:
# Convert the langs column to a list of strings
df['Languages'] = df['Languages'].astype(str)
df['Languages'] = df['Languages'].str.strip('[]').str.replace("'", "").str.split(", ")

# Create a column with the number of languages supported
df['langs_count'] = df['Languages'].apply(lambda x: len(x))

# Drop the English language from the Languages column (it is the most common language and would dominate the model)
df['Languages'] = df['Languages'].apply(lambda x: [lang for lang in x if lang not in ['EN']])

# Replace langs with counts less than 500 with 'infrequent_langs' as it would represent a very small percentage of the data (less than 10%)
other = df['Languages'].explode().value_counts()[df['Languages'].explode().value_counts() < 500].index
df['Languages'] = df['Languages'].apply(lambda x: [lang if lang not in other else 'infrequent_langs' for lang in x])

# Replace empty lists with 'infrequent'
df['Languages'] = df['Languages'].apply(lambda x: ['infrequent_langs'] if len(x) == 0 else x)

# Get dummy variables for the langs
langs = pd.get_dummies(df['Languages'].apply(pd.Series).stack()).sum(level=0)

# Add the dummy variables to the original dataframe
df = pd.concat([df, langs], axis=1)

# Drop the original column
df = df.drop(['Languages'], axis=1)

print(df.shape)

df.iloc[:, 15:].head()

### 3. Multi-label binarizer

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convert the langs column to a list of strings
df['Languages'] = df['Languages'].astype(str)
df['Languages'] = df['Languages'].str.strip('[]').str.replace("'", "").str.split(", ")

# Create a column with the number of languages supported
df['langs_count'] = df['Languages'].apply(lambda x: len(x))

# Drop the English language from the Languages column (it is the most common language and would dominate the model)
df['Languages'] = df['Languages'].apply(lambda x: [lang for lang in x if lang not in ['EN']])

# Replace langs with counts less than 500 with 'infrequent_langs' as it would represent a very small percentage of the data (less than 10%)
other = df['Languages'].explode().value_counts()[df['Languages'].explode().value_counts() < 400].index
df['Languages'] = df['Languages'].apply(lambda x: [lang if lang not in other else 'infrequent_lang' for lang in x])

# Instantiate the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit the MultiLabelBinarizer to the langs
mlb.fit(df['Languages'])

# Drop nan from the classes
mlb.classes_ = np.delete(mlb.classes_, np.where(mlb.classes_ == 'nan'))

# Save the mlb for later use with the test data
pickle.dump(mlb, open('encoders/mlb_langs.pkl', 'wb'))

# Transform the langs into a one-hot encoded array
langs_mlb = mlb.transform(df['Languages'])

# Create a dataframe from the one-hot encoded array
langs_mlb_df = pd.DataFrame(langs_mlb, columns=mlb.classes_)

# Add the encoded langs to the original dataframe
df = pd.concat([df, langs_mlb_df], axis=1)

# Drop the original column
df = df.drop(['Languages'], axis=1)

print(df.shape)

df.iloc[:, 15:].head()

(5166, 36)


Unnamed: 0,Board,Card,Casual,Education,Family,Puzzle,Role Playing,Simulation,Sports,infrequent_genre,...,DE,ES,FR,IT,JA,KO,PT,RU,ZH,infrequent_lang
2173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
927,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1741,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## In-app Purchases preprocessing

In [38]:
# Free apps might skew the in-app purchases column,
# so we might split the dataset into free and paid apps

df['In-app Purchases'] = df['In-app Purchases'].astype(str)
df['In-app Purchases'] = df['In-app Purchases'].str.strip('[]').str.replace("'", "").str.split(", ")

In [39]:
# Convert to float
df['In-app Purchases'] = df['In-app Purchases'].apply(lambda x: [float(i) for i in x])

# Get the number of in-app purchases
df['purchases_count'] = df['In-app Purchases'].apply(lambda x: len(x))

# Get the lowest, highest and average purchase
df['lowest_purchase'] = df['In-app Purchases'].apply(lambda x: min(x) if len(x) > 0 else 0)
df['highest_purchase'] = df['In-app Purchases'].apply(lambda x: max(x) if len(x) > 0 else 0)
df['average_purchase'] = df['In-app Purchases'].apply(lambda x: np.mean(x) if len(x) > 0 else 0)


In [40]:
# Drop the original column
df = df.drop(['In-app Purchases'], axis=1)

df['lowest_purchase'] = df['lowest_purchase'].fillna(0)
df['highest_purchase'] = df['highest_purchase'].fillna(0)
df['average_purchase'] = df['average_purchase'].fillna(0)
df.head()

Unnamed: 0,Name,Subtitle,Icon URL,User Rating Count,Price,Description,Developer,Age Rating,Languages,Size,Genres,Original Release Date,Current Version Release Date,Average User Rating,purchases_count,lowest_purchase,highest_purchase,average_purchase
2173,Digfender,,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,348,0.0,"""Digfender is easily one of the best tower def...",Mugshot Games Pty Ltd,9.0,EN,479550464,"Games, Strategy, Puzzle",735920,736359,5.0,10,1.99,19.99,6.69
927,TowerMadness Zero,Classic Defense Strategy,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,95466,0.0,"""Addictive, time-sucking fun. There are plenty...",Limbic Software,9.0,"EN, DE",58547200,"Games, Puzzle, Strategy",733705,736758,4.0,10,1.99,5.99,2.49
2499,Free Gems Guide Calculator for Clash Of Clans ...,,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,822,0.0,Welcome to your new favorite Clash of Clans Ca...,Tu Anh Do,17.0,EN,22884352,"Games, Utilities, Strategy, Casual",735558,735977,3.5,1,0.0,0.0,0.0
45,Memory Sequence - Brain Game,"Bounce, Bounce, Bounce",https://is1-ssl.mzstatic.com/image/thumb/Purpl...,14,0.0,"""Bounce, Bounce, Bounce your way past each obs...",OSUAPP LTD,4.0,EN,164545536,"Games, Strategy, Casual, Education",736684,737228,4.5,2,2.99,2.99,2.99
1741,NEO Scavenger,Post-apocalyptic survival RPG,https://is3-ssl.mzstatic.com/image/thumb/Purpl...,147,0.0,Play the acclaimed PC survival RPG on your tab...,"Blue Bottle Games, LLC",17.0,EN,117317632,"Games, Entertainment, Role Playing, Strategy",736536,736661,4.0,1,9.99,9.99,9.99


## Age Rating preprocessing

In [37]:
# Convert to string
df['Age Rating'] = df['Age Rating'].astype(str)

# Remove the + sign
df['Age Rating'] = df['Age Rating'].str.replace('+', '')

# Convert to int
df['Age Rating'] = df['Age Rating'].astype(float)

## Dates preprocessing

In [36]:
# Convert the datetime to ordinal
df['Original Release Date'] = df['Original Release Date'].apply(lambda x: x.toordinal())
df['Current Version Release Date'] = df['Current Version Release Date'].apply(lambda x: x.toordinal())

## NLP preprocessing

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from textblob import TextBlob
import re

def preprocess_nlp(col):
    df[col] = df[col].astype(str)

    # Remove URLs and email addresses
    df[col] = df[col].apply(lambda x: re.sub(r'http\S+|www.\S+|\S+@\S+', '', x))

    # Remove the punctuation, numbers, and convert to lowercase
    df[col] = df[col].apply(lambda x: " ".join(re.findall(r'\w+', x.lower())))

    # Remove the stopwords
    stop = stopwords.words('english')
    df[col] = df[col].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

    # Stemming
    st = nltk.PorterStemmer()
    df[col] = df[col].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

    # Lemmatization
    lem = nltk.WordNetLemmatizer()
    df[col] = df[col].apply(lambda x: " ".join([lem.lemmatize(word) for word in x.split()]))

    # Remove the frequent and rare words
    freq = pd.Series(' '.join(df[col]).split()).value_counts()
    common_freq = list(freq[:10].index)
    rare_freq = list(freq[-10:].index)
    df[col] = df[col].apply(lambda x: " ".join(x for x in x.split() if x not in common_freq+rare_freq))

    # Remove the whitespaces
    df[col] = df[col].apply(lambda x: " ".join(x.strip() for x in x.split()))

    # Replace NaN values with empty string
    df[col] = df[col].fillna('')

    # Convert text data to bag-of-words representation
    vectorizer = CountVectorizer()
    BoW = vectorizer.fit_transform(df[col])

    # Apply principal component analysis to reduce the dimensionality
    pca_ = PCA(n_components=10)
    pca_col = pca_.fit_transform(BoW.toarray())

    # Add the PCA-transformed col to the original dataframe
    for feat in range(10):
        df[f'{col}_PCA_{feat}'] = pca_col[:, feat]



In [None]:
preprocess_nlp('Description')
preprocess_nlp('Subtitle')
preprocess_nlp('Name')

df = df.drop(['Description', 'Subtitle', 'Name'], axis=1)

df.head()


## Icon preprocessing

### Download the icons

In [None]:
# Convert to string
df['Icon URL'] = df['Icon URL'].astype(str)

import requests
import os
import shutil


def download_image(url, filename):
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(filename, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)


# Create a folder to store the images
if not os.path.exists('icons'):
    os.makedirs('icons')

# Download the images
for i, row in df.iterrows():
    download_image(row['Icon URL'], f'icons/{i}.png')



In [None]:
# Replace the URL with the icon filename which is the index of the row
df['Icon URL'] = df.apply(lambda row : f'icons/{row.name}.png', axis=1)
df.head()

### Extract features from the icons

In [None]:
import cv2

def preprocess_icon(img_path):
    # Load the game icon image
    img = cv2.imread(img_path)
    img = cv2.resize(img, (100, 100))

    # Extract color features using color histograms
    colors = ('b', 'g', 'r')
    color_features = []
    for k, col in enumerate(colors):
        hist = cv2.calcHist([img], [k], None, [256], [0, 256])
        color_features.append(hist)

    # Reshape the color features to have a single dimension
    color_features = np.concatenate(color_features).ravel()

    # Extract shape features using edge detection
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 100, 200)
    edge_features = np.array(edges).flatten()

    # Combine the color and shape features into a single feature vector
    feature_vector = np.concatenate((color_features, edge_features))

    # Normalize the feature vector to have unit length
    normalized_feature_vector = feature_vector / np.linalg.norm(feature_vector)

    return normalized_feature_vector

# Create a list to store the feature vectors
icon_features = []

# Iterate over the images and extract the features
for i, row in df.iterrows():
    icon_features.append(preprocess_icon(row['Icon URL']))
    
# Apply PCA to reduce the number of features
pca = PCA(n_components=10)
icon_features_pca = pca.fit_transform(icon_features)

icon_features_df = pd.DataFrame(icon_features_pca, columns=[f'icon_{i}' for i in range(icon_features_pca.shape[1])])
icon_features_df.head()

### Add the icon features to the dataset

In [None]:
# Concatenate the icon features with the other features
df = pd.concat([df, icon_features_df], axis=1)

# Drop the icon URL column
df = df.drop(['Icon URL'], axis=1)

In [None]:
df.head()

## Save the preprocessed dataset

In [None]:
df.to_csv('df_train.csv', index=False)

### Feature Scaling

In [None]:
# Scale the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Save the scaler
pickle.dump(scaler, open('scalers/std_scaler.pkl', 'wb'))

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

# Save the scaler
pickle.dump(scaler, open('scalers/min_max_scaler.pkl', 'wb'))

### Feature Selection

In [None]:
# Feature selection
from sklearn.feature_selection import SelectKBest, f_regression
selector = SelectKBest(f_regression, k=10)
X = selector.fit_transform(features, y)

# print the selected features
print(selector.get_support(indices=True))
seleted_features = features.columns[selector.get_support(indices=True)]
print(seleted_features)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Save the model
pickle.dump(model, open('models/LR_model.pkl', 'wb'))

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

# print the features weights
for i in range(len(model.coef_)):
    print(f'Feature {selected_features[i]}: {model.coef_[i]}')


### Ridge Regression

In [None]:
# Create a ridge regression model
model = Ridge(alpha=0.5)

# Train the model
model.fit(X_train, y_train)

# Save the model
pickle.dump(model, open('models/Ridge_model.pkl', 'wb'))

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))


### Lasso Regression

In [None]:
# Create a lasso regression model
model = Lasso(alpha=0.5)

# Train the model
model.fit(X_train, y_train)

# Save the model
pickle.dump(model, open('models/Lasso_model.pkl', 'wb'))

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))


### Elastic Net Regression

In [None]:
# Create an elastic net regression model
model = ElasticNet(alpha=0.5)

# Train the model
model.fit(X_train, y_train)

# Save the model
pickle.dump(model, open('models/ElasticNet_model.pkl', 'wb'))

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))


### Polynomial Regression

In [None]:
# Create a polynomial regression model
poly = PolynomialFeatures(degree=3)

X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.fit_transform(X_test)

# Train the model
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Save the model
pickle.dump(model, open('models/Polynomial_model.pkl', 'wb'))

# Make predictions
y_pred = model.predict(X_test_poly)

# Evaluate the model
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))
