In [5]:
%matplotlib inline
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error


In [6]:
data = pd.read_csv('../player_count_data/PUBG_week.csv')

In [8]:
data.head(8)

Unnamed: 0,Date,Time,Player_count
0,9/23/2023,1:25:00,2722578
1,9/23/2023,1:30:00,2718172
2,9/23/2023,1:35:00,2712038
3,9/23/2023,1:40:00,2702890
4,9/23/2023,1:45:00,2691888
5,9/23/2023,1:50:00,2679008
6,9/23/2023,1:55:00,2665251
7,9/23/2023,2:00:00,2650501


In [222]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2017 entries, 0 to 2016
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          2017 non-null   object
 1   Time          2017 non-null   object
 2   Player_count  2017 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 47.4+ KB


In [223]:
data = data.drop(data[(data["Player_count"]== 0)].index)

In [224]:
data["New_date"] = pd.to_datetime(data['Date'], format='%m/%d/%Y')

In [225]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2008 entries, 0 to 2016
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          2008 non-null   object        
 1   Time          2008 non-null   object        
 2   Player_count  2008 non-null   int64         
 3   New_date      2008 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 78.4+ KB


In [226]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2008 entries, 0 to 2016
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          2008 non-null   object        
 1   Time          2008 non-null   object        
 2   Player_count  2008 non-null   int64         
 3   New_date      2008 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 78.4+ KB


### Feature engineering (transform data to be used for the machine learning algo) 

In [227]:

data["New_time"] = pd.to_datetime(data['Time'], format='%H:%M:%S')

In [228]:
data["year"] = data["New_date"].dt.year
data["month"] = data["New_date"].dt.month
data["day"] = data["New_date"].dt.day
data["hour"] = data["New_time"].dt.hour
data["min"] = data["New_time"].dt.minute

In [229]:
data.drop(columns=["Date","Time","New_date","New_time"], inplace=True)

In [230]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2008 entries, 0 to 2016
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Player_count  2008 non-null   int64
 1   year          2008 non-null   int64
 2   month         2008 non-null   int64
 3   day           2008 non-null   int64
 4   hour          2008 non-null   int64
 5   min           2008 non-null   int64
dtypes: int64(6)
memory usage: 109.8 KB


In [231]:
data.head()

Unnamed: 0,Player_count,year,month,day,hour,min
0,2722578,2023,9,23,1,25
1,2718172,2023,9,23,1,30
2,2712038,2023,9,23,1,35
3,2702890,2023,9,23,1,40
4,2691888,2023,9,23,1,45


### One hot encoding

### Preapering for training 

In [232]:
X = data.iloc[:, 1:]
y = data["Player_count"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Models and testing

#### Random forest

In [233]:
model_random_forest = RandomForestRegressor()
model_random_forest.fit(X_train, y_train)

In [234]:
y_pred = model_random_forest.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 25466.85522864432


#### Linear regression


In [235]:
model_linear = LinearRegression()
model_linear.fit(X, y)

In [236]:
y_pred = model_linear.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 759250.9114347548


In [237]:
model_filename = 'PUBG_week_random_forest_model.pkl'
joblib.dump(model_random_forest, model_filename)

['PUBG_week_random_forest_model.pkl']