# Predicting Online Game Behavior Dataset

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
df = pd.read_csv('online_gaming_behavior_dataset.csv')

In [3]:
df.head()

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,Male,Other,Strategy,16.271119,0,Medium,6,108,79,25,Medium
1,9001,29,Female,USA,Strategy,5.525961,0,Medium,5,144,11,10,Medium
2,9002,22,Female,USA,Sports,8.223755,0,Easy,16,142,35,41,High
3,9003,35,Male,USA,Action,5.265351,1,Easy,9,85,57,47,Medium
4,9004,33,Male,Europe,Action,15.531945,0,Medium,2,131,95,37,Medium


In [4]:
df.describe()

Unnamed: 0,PlayerID,Age,PlayTimeHours,InGamePurchases,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked
count,40034.0,40034.0,40034.0,40034.0,40034.0,40034.0,40034.0,40034.0
mean,29016.5,31.992531,12.024365,0.200854,9.471774,94.792252,49.655568,24.526477
std,11556.964675,10.043227,6.914638,0.400644,5.763667,49.011375,28.588379,14.430726
min,9000.0,15.0,0.000115,0.0,0.0,10.0,1.0,0.0
25%,19008.25,23.0,6.067501,0.0,4.0,52.0,25.0,12.0
50%,29016.5,32.0,12.008002,0.0,9.0,95.0,49.0,25.0
75%,39024.75,41.0,17.963831,0.0,14.0,137.0,74.0,37.0
max,49033.0,49.0,23.999592,1.0,19.0,179.0,99.0,49.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40034 entries, 0 to 40033
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PlayerID                   40034 non-null  int64  
 1   Age                        40034 non-null  int64  
 2   Gender                     40034 non-null  object 
 3   Location                   40034 non-null  object 
 4   GameGenre                  40034 non-null  object 
 5   PlayTimeHours              40034 non-null  float64
 6   InGamePurchases            40034 non-null  int64  
 7   GameDifficulty             40034 non-null  object 
 8   SessionsPerWeek            40034 non-null  int64  
 9   AvgSessionDurationMinutes  40034 non-null  int64  
 10  PlayerLevel                40034 non-null  int64  
 11  AchievementsUnlocked       40034 non-null  int64  
 12  EngagementLevel            40034 non-null  object 
dtypes: float64(1), int64(7), object(5)
memory usag

In [6]:
missing_data = df.isnull().sum()
print(missing_data)

PlayerID                     0
Age                          0
Gender                       0
Location                     0
GameGenre                    0
PlayTimeHours                0
InGamePurchases              0
GameDifficulty               0
SessionsPerWeek              0
AvgSessionDurationMinutes    0
PlayerLevel                  0
AchievementsUnlocked         0
EngagementLevel              0
dtype: int64


In [7]:
duplicated_data = df.duplicated().sum()
print(duplicated_data)

0


## Encoding data

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['GameDifficulty'] = le.fit_transform(df['GameDifficulty'])

In [9]:
df.head()

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,Male,Other,Strategy,16.271119,0,2,6,108,79,25,Medium
1,9001,29,Female,USA,Strategy,5.525961,0,2,5,144,11,10,Medium
2,9002,22,Female,USA,Sports,8.223755,0,0,16,142,35,41,High
3,9003,35,Male,USA,Action,5.265351,1,0,9,85,57,47,Medium
4,9004,33,Male,Europe,Action,15.531945,0,2,2,131,95,37,Medium


In [10]:
df = pd.get_dummies(df, columns=['Location', 'GameGenre', 'Gender'], drop_first=False)

In [11]:
df.head()

Unnamed: 0,PlayerID,Age,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel,...,Location_Europe,Location_Other,Location_USA,GameGenre_Action,GameGenre_RPG,GameGenre_Simulation,GameGenre_Sports,GameGenre_Strategy,Gender_Female,Gender_Male
0,9000,43,16.271119,0,2,6,108,79,25,Medium,...,False,True,False,False,False,False,False,True,False,True
1,9001,29,5.525961,0,2,5,144,11,10,Medium,...,False,False,True,False,False,False,False,True,True,False
2,9002,22,8.223755,0,0,16,142,35,41,High,...,False,False,True,False,False,False,True,False,True,False
3,9003,35,5.265351,1,0,9,85,57,47,Medium,...,False,False,True,True,False,False,False,False,False,True
4,9004,33,15.531945,0,2,2,131,95,37,Medium,...,True,False,False,True,False,False,False,False,False,True


## Splitting the dataset into the Training set and Test set

In [12]:
X = df.drop(columns=['PlayerID', 'EngagementLevel'])
y = df['EngagementLevel']

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Training the Random Forest Classification model on the Training set

In [15]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [16]:
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix and Classification Report

In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[1765   65  205]
 [  66 1792  235]
 [ 150  170 3559]]


0.8887223679280629

In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        High       0.89      0.87      0.88      2035
         Low       0.88      0.86      0.87      2093
      Medium       0.89      0.92      0.90      3879

    accuracy                           0.89      8007
   macro avg       0.89      0.88      0.88      8007
weighted avg       0.89      0.89      0.89      8007

