In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

There are three versions of the data set, each using a different approach to columns and binning methods. In this first part, we pass each data set through a standard linear regression model and examine the model's performance.

In the second part, we will pass the data set through the XGBoost model and compare its predictive performance against that of the standard linear regression model.

# Part 1: Linear Regression Models

In [2]:
# Read the three prepared variants of the sales data into df1, df2 and df3
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'regreadyvgsales2.csv',
        'regreadyvgsales3.csv',
        'regreadyvgsales4.csv',
    )
)

Let's examine the features of each of the data sets. 

In [4]:
# Summarise data set #1: column names, non-null counts, dtypes and memory usage
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6825 entries, 0 to 6824
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Platform         6825 non-null   int64  
 1   Year_of_Release  6825 non-null   float64
 2   Genre            6825 non-null   int64  
 3   Publisher        6825 non-null   int64  
 4   NA_Sales         6825 non-null   float64
 5   EU_Sales         6825 non-null   float64
 6   JP_Sales         6825 non-null   float64
 7   Other_Sales      6825 non-null   float64
 8   Global_Sales     6825 non-null   float64
 9   Critic_Score     6825 non-null   float64
 10  User_Score       6825 non-null   float64
 11  Rating           6825 non-null   int64  
dtypes: float64(8), int64(4)
memory usage: 640.0 KB


In [5]:
# Summarise data set #2: column names, non-null counts and dtypes
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6825 entries, 0 to 6824
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   NA_Sales            6825 non-null   float64
 1   EU_Sales            6825 non-null   float64
 2   JP_Sales            6825 non-null   float64
 3   Other_Sales         6825 non-null   float64
 4   Global_Sales        6825 non-null   float64
 5   Years_on_Market     6825 non-null   float64
 6   Critic_Score        6825 non-null   float64
 7   User_Score          6825 non-null   float64
 8   Genre_Action        6825 non-null   float64
 9   Genre_Adventure     6825 non-null   float64
 10  Genre_Fighting      6825 non-null   float64
 11  Genre_Misc          6825 non-null   float64
 12  Genre_Platform      6825 non-null   float64
 13  Genre_Puzzle        6825 non-null   float64
 14  Genre_Racing        6825 non-null   float64
 15  Genre_Role-Playing  6825 non-null   float64
 16  Genre_

In [12]:
# Summarise data set #3 (pandas prints a condensed summary for this wide frame)
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6825 entries, 0 to 6824
Columns: 168 entries, NA_Sales to Platform_XOne_y
dtypes: float64(168)
memory usage: 8.7 MB


## Data Set #1 Linear Regression Model

In [13]:
# Baseline linear regression on data set #1.

# Target variable & feature variables.
# The regional sales columns are dropped along with the target
# (presumably because they add up to Global_Sales -- TODO confirm).
y1 = df1.Global_Sales
X1 = df1.drop(['Global_Sales', 'NA_Sales','EU_Sales', 'JP_Sales','Other_Sales'], axis=1)

# Split data into train & test sets (fixed seed so the split is reproducible)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=1)

# Create the model and fit it on the training rows only
model = LinearRegression()
model.fit(X1_train, y1_train)

# Evaluate model scores (for a regressor, .score() returns R-squared)
training_score1 = model.score(X1_train, y1_train)
testing_score1 = model.score(X1_test, y1_test)

# Print score results
print(f'Training Score: {training_score1}')
print(f'Testing Score: {testing_score1}')

# Predict for every row of the data set.
# NOTE(review): X1 contains both the training and the test rows, so the MSE and
# R2 below are not a pure hold-out estimate; use X1_test / y1_test for that.
predicted1 = model.predict(X1)

# Score the prediction with MSE and R2
mse_1 = mean_squared_error(y1, predicted1)
r2_1 = r2_score(y1, predicted1)

print(f"mean squared error (MSE): {mse_1}")
print(f"R-squared (R2 ): {r2_1}")


Training Score: 0.06586389374708346
Testing Score: 0.0488981770903949
mean squared error (MSE): 2.3078959016245104
R-squared (R2 ): 0.06445781443655518


## Data Set #2 Linear Regression Model

In [15]:
# Baseline linear regression on data set #2.

# Target variable & feature variables.
# The regional sales columns are dropped along with the target
# (presumably because they add up to Global_Sales -- TODO confirm).
y2 = df2.Global_Sales
X2 = df2.drop(['Global_Sales', 'NA_Sales','EU_Sales', 'JP_Sales','Other_Sales'], axis=1)

# Split data into train & test sets (fixed seed so the split is reproducible)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=1)

# Use a fresh estimator so this cell does not rely on a `model` object that
# was created (and already fitted on other data) in an earlier cell.
model = LinearRegression()
model.fit(X2_train, y2_train)

# Evaluate model scores (for a regressor, .score() returns R-squared)
training_score2 = model.score(X2_train, y2_train)
testing_score2 = model.score(X2_test, y2_test)

# Print score results
print(f'Training Score: {training_score2}')
print(f'Testing Score: {testing_score2}')

# Predict for every row of the data set.
# NOTE(review): X2 contains both the training and the test rows, so the MSE and
# R2 below are not a pure hold-out estimate; use X2_test / y2_test for that.
predicted2 = model.predict(X2)

# Score the prediction with MSE and R2
mse_2 = mean_squared_error(y2, predicted2)
r2_2 = r2_score(y2, predicted2)

print(f"mean squared error (MSE): {mse_2}")
print(f"R-squared (R2 ): {r2_2}")

Training Score: 0.08204159335761674
Testing Score: 0.11216124409881678
mean squared error (MSE): 8.199766707826239
R-squared (R2 ): 0.08709051201632101


## Data Set #3 Linear Regression Model

In [16]:
# Baseline linear regression on data set #3.

# Target variable & feature variables.
# The regional sales columns are dropped along with the target
# (presumably because they add up to Global_Sales -- TODO confirm).
y3 = df3.Global_Sales
X3 = df3.drop(['Global_Sales', 'NA_Sales','EU_Sales', 'JP_Sales','Other_Sales'], axis=1)

# Split data into train & test sets (fixed seed so the split is reproducible)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state=1)

# Use a fresh estimator so this cell does not rely on a `model` object that
# was created (and already fitted on other data) in an earlier cell.
model = LinearRegression()
model.fit(X3_train, y3_train)

# Evaluate model scores (for a regressor, .score() returns R-squared)
training_score3 = model.score(X3_train, y3_train)
testing_score3 = model.score(X3_test, y3_test)

# Print score results
print(f'Training Score: {training_score3}')
print(f'Testing Score: {testing_score3}')

# Predict for every row of the data set.
# NOTE(review): X3 contains both the training and the test rows, so the MSE and
# R2 below are not a pure hold-out estimate; use X3_test / y3_test for that.
predicted3 = model.predict(X3)

# Score the prediction with MSE and R2
mse_3 = mean_squared_error(y3, predicted3)
r2_3 = r2_score(y3, predicted3)

print(f"mean squared error (MSE): {mse_3}")
print(f"R-squared (R2 ): {r2_3}")

Training Score: 0.16672358857984726
Testing Score: 0.19349594050854213
mean squared error (MSE): 7.444131604072688
R-squared (R2 ): 0.1712180829887665


# Part 2: XGBoost Regression Model

Info about XGBoost: https://machinelearningmastery.com/xgboost-for-regression/

In [17]:
import xgboost