In [571]:
%matplotlib inline
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error


In [572]:
data = pd.read_csv('../player_count_data/all_player_count.csv')

In [573]:
data.head()

Unnamed: 0,Date,Time,title,releasedate,developer,publisher,category,player_count
0,9/21/2023,15:57,PLAYERUNKNOWNS BATTLEGROUNDS,12/21/2017,PUBG Corporation,PUBG Corporation,top_1000,1410640
1,9/21/2023,15:58,PLAYERUNKNOWNS BATTLEGROUNDS,12/21/2017,PUBG Corporation,PUBG Corporation,top_1000,1410640
2,9/21/2023,15:59,PLAYERUNKNOWNS BATTLEGROUNDS,12/21/2017,PUBG Corporation,PUBG Corporation,top_1000,1410640
3,9/21/2023,16:00,PLAYERUNKNOWNS BATTLEGROUNDS,12/21/2017,PUBG Corporation,PUBG Corporation,top_1000,1410640
4,9/21/2023,16:01,PLAYERUNKNOWNS BATTLEGROUNDS,12/21/2017,PUBG Corporation,PUBG Corporation,top_1000,1410640


In [574]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2882 entries, 0 to 2881
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          2882 non-null   object
 1   Time          2882 non-null   object
 2   title         2882 non-null   object
 3   releasedate   2882 non-null   object
 4   developer     2882 non-null   object
 5   publisher     2882 non-null   object
 6   category      2882 non-null   object
 7   player_count  2882 non-null   int64 
dtypes: int64(1), object(7)
memory usage: 180.2+ KB


In [575]:
data = data.drop(data[(data["player_count"]== 0)].index)

In [576]:
data["date"] = pd.to_datetime(data['Date'], format='%m/%d/%Y')

In [577]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2862 entries, 0 to 2881
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          2862 non-null   object        
 1   Time          2862 non-null   object        
 2   title         2862 non-null   object        
 3   releasedate   2862 non-null   object        
 4   developer     2862 non-null   object        
 5   publisher     2862 non-null   object        
 6   category      2862 non-null   object        
 7   player_count  2862 non-null   int64         
 8   date          2862 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 223.6+ KB


In [578]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2862 entries, 0 to 2881
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          2862 non-null   object        
 1   Time          2862 non-null   object        
 2   title         2862 non-null   object        
 3   releasedate   2862 non-null   object        
 4   developer     2862 non-null   object        
 5   publisher     2862 non-null   object        
 6   category      2862 non-null   object        
 7   player_count  2862 non-null   int64         
 8   date          2862 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 223.6+ KB


### Feature engineering (transform data to be used for the machine learning algo) 

In [579]:
data["release_date"] = pd.to_datetime(data['releasedate'], format='%m/%d/%Y')
data["time"] = pd.to_datetime(data['Time'], format='%H:%M')

In [580]:
data.drop(columns=["Date", "releasedate", "Time"], inplace=True)

In [581]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2862 entries, 0 to 2881
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   title         2862 non-null   object        
 1   developer     2862 non-null   object        
 2   publisher     2862 non-null   object        
 3   category      2862 non-null   object        
 4   player_count  2862 non-null   int64         
 5   date          2862 non-null   datetime64[ns]
 6   release_date  2862 non-null   datetime64[ns]
 7   time          2862 non-null   datetime64[ns]
dtypes: datetime64[ns](3), int64(1), object(4)
memory usage: 201.2+ KB


In [582]:
data["year"] = data["date"].dt.year
data["month"] = data["date"].dt.month
data["day"] = data["date"].dt.day
data["hour"] = data["time"].dt.hour
data["min"] = data["time"].dt.minute


In [583]:
data["release_year"] = data["release_date"].dt.year
data["release_month"] = data["release_date"].dt.month
data["release_day"] = data["release_date"].dt.day

In [584]:
data.drop(columns=["date", "release_date", "time"], inplace=True)

In [585]:
data.head()

Unnamed: 0,title,developer,publisher,category,player_count,year,month,day,hour,min,release_year,release_month,release_day
0,PLAYERUNKNOWNS BATTLEGROUNDS,PUBG Corporation,PUBG Corporation,top_1000,1410640,2023,9,21,15,57,2017,12,21
1,PLAYERUNKNOWNS BATTLEGROUNDS,PUBG Corporation,PUBG Corporation,top_1000,1410640,2023,9,21,15,58,2017,12,21
2,PLAYERUNKNOWNS BATTLEGROUNDS,PUBG Corporation,PUBG Corporation,top_1000,1410640,2023,9,21,15,59,2017,12,21
3,PLAYERUNKNOWNS BATTLEGROUNDS,PUBG Corporation,PUBG Corporation,top_1000,1410640,2023,9,21,16,0,2017,12,21
4,PLAYERUNKNOWNS BATTLEGROUNDS,PUBG Corporation,PUBG Corporation,top_1000,1410640,2023,9,21,16,1,2017,12,21


### One hot encoding

In [586]:
one_hot_encoded_data = pd.get_dummies(data, columns = ['title', 'developer', "publisher", "category"])
one_hot_encoded_data.head()

Unnamed: 0,player_count,year,month,day,hour,min,release_year,release_month,release_day,title_Counter-Strike,title_PLAYERUNKNOWNS BATTLEGROUNDS,developer_PUBG Corporation,developer_Valve,publisher_PUBG Corporation,publisher_Valve,category_top_1000
0,1410640,2023,9,21,15,57,2017,12,21,0,1,1,0,1,0,1
1,1410640,2023,9,21,15,58,2017,12,21,0,1,1,0,1,0,1
2,1410640,2023,9,21,15,59,2017,12,21,0,1,1,0,1,0,1
3,1410640,2023,9,21,16,0,2017,12,21,0,1,1,0,1,0,1
4,1410640,2023,9,21,16,1,2017,12,21,0,1,1,0,1,0,1


### Preapering for training 

In [587]:
X = X = one_hot_encoded_data.iloc[:, 1:]
y = one_hot_encoded_data["player_count"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Models and testing

#### Random forest

In [588]:
model_random_forest = RandomForestRegressor()
model_random_forest.fit(X_train, y_train)

In [589]:
y_pred = model_random_forest.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 20647.876301602493


#### Linear regression


In [591]:
model_linear = LinearRegression()
model_linear.fit(X, y)

In [592]:
y_pred = model_linear.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 526890.9126192376


In [None]:
model_filename = 'linear_regression_model.pkl'
joblib.dump(model, model_filename)