# Data Science Project: Planning Stage (Individual)

## Predicting Player Engagement on a Minecraft Research Server

This project analyzes player behavior data from a Minecraft research server to understand play patterns and predict player engagement levels.

## 1. Data Loading and Initial Exploration

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Pandas display configuration: show every column, cap printed rows at 100,
# and render floats with two decimal places.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

# Plot appearance. The palette is applied after the style sheet on purpose:
# both touch the colour cycle, so the later call is the one that sticks.
# NOTE(review): the 'seaborn-v0_8-*' style names need a recent matplotlib — confirm the environment.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

### Load the datasets

In [2]:
# Read the players table from disk into a DataFrame
players_df = pd.read_csv('players.csv')

# Report the table dimensions and the number of distinct hashed e-mails
print(
    f"Players dataset shape: {players_df.shape}",
    f"Number of unique players: {players_df['hashedEmail'].nunique()}",
    "\nFirst 5 rows of players data:",
    sep="\n",
)

# Last expression of the cell, so the notebook renders the preview as a table
players_df.head()

Players dataset shape: (196, 7)
Number of unique players: 196

First 5 rows of players data:


Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,Age
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9.0
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17.0
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17.0
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21.0
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21.0


In [3]:
# Read the session log from disk into a DataFrame
sessions_df = pd.read_csv('sessions.csv')

# Report the table dimensions and how many distinct players appear in the log
print(
    f"Sessions dataset shape: {sessions_df.shape}",
    f"Number of unique players in sessions: {sessions_df['hashedEmail'].nunique()}",
    "\nFirst 5 rows of sessions data:",
    sep="\n",
)

# Last expression of the cell, so the notebook renders the preview as a table
sessions_df.head()

Sessions dataset shape: (1535, 5)
Number of unique players in sessions: 125

First 5 rows of sessions data:


Unnamed: 0,hashedEmail,start_time,end_time,original_start_time,original_end_time
0,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,30/06/2024 18:12,30/06/2024 18:24,1719770000000.0,1719770000000.0
1,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,17/06/2024 23:33,17/06/2024 23:46,1718670000000.0,1718670000000.0
2,f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3...,25/07/2024 17:34,25/07/2024 17:57,1721930000000.0,1721930000000.0
3,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,25/07/2024 03:22,25/07/2024 03:58,1721880000000.0,1721880000000.0
4,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,25/05/2024 16:01,25/05/2024 16:12,1716650000000.0,1716650000000.0


### Data Types and Basic Information

In [4]:
# Schema overview of the players table: column dtypes and non-null counts
print("Players Dataset Info:", "=" * 50, sep="\n")
players_df.info()  # prints its report directly; nothing to display as a value
print(f"\n{'=' * 50}")

Players Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   experience    196 non-null    object 
 1   subscribe     196 non-null    bool   
 2   hashedEmail   196 non-null    object 
 3   played_hours  196 non-null    float64
 4   name          196 non-null    object 
 5   gender        196 non-null    object 
 6   Age           194 non-null    float64
dtypes: bool(1), float64(2), object(4)
memory usage: 9.5+ KB



In [5]:
# Schema overview of the sessions table: column dtypes and non-null counts
print("Sessions Dataset Info:", "=" * 50, sep="\n")
sessions_df.info()  # prints its report directly; nothing to display as a value
print(f"\n{'=' * 50}")

Sessions Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1535 entries, 0 to 1534
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   hashedEmail          1535 non-null   object 
 1   start_time           1535 non-null   object 
 2   end_time             1533 non-null   object 
 3   original_start_time  1535 non-null   float64
 4   original_end_time    1533 non-null   float64
dtypes: float64(2), object(3)
memory usage: 60.1+ KB



### Check for Missing Values

In [6]:
# Summarise missing values per column in the players dataset
print("Missing Values in Players Dataset:")
print("=" * 40)

# Null count per column; the percentage reuses these counts instead of
# scanning the frame for nulls a second time.
players_missing = players_df.isnull().sum()
players_missing_pct = 100 * players_missing / len(players_df)
players_missing_table = pd.DataFrame({
    'Missing_Count': players_missing,
    'Percentage': players_missing_pct
})

# Show only the affected columns. When no column has nulls, print the
# all-clear message on its own rather than an "Empty DataFrame" dump
# followed by the message.
if players_missing.any():
    print(players_missing_table[players_missing_table['Missing_Count'] > 0])
else:
    print("No missing values found in players dataset!")

Missing Values in Players Dataset:
     Missing_Count  Percentage
Age              2        1.02


In [7]:
# Summarise missing values per column in the sessions dataset
print("Missing Values in Sessions Dataset:")
print("=" * 40)

# Null count per column; the percentage reuses these counts instead of
# scanning the frame for nulls a second time.
sessions_missing = sessions_df.isnull().sum()
sessions_missing_pct = 100 * sessions_missing / len(sessions_df)
sessions_missing_table = pd.DataFrame({
    'Missing_Count': sessions_missing,
    'Percentage': sessions_missing_pct
})

# Show only the affected columns. When no column has nulls, print the
# all-clear message on its own rather than an "Empty DataFrame" dump
# followed by the message.
if sessions_missing.any():
    print(sessions_missing_table[sessions_missing_table['Missing_Count'] > 0])
else:
    print("No missing values found in sessions dataset!")

Missing Values in Sessions Dataset:
                   Missing_Count  Percentage
end_time                       2        0.13
original_end_time              2        0.13


### Data Quality Checks

In [8]:
# --- Compute -----------------------------------------------------------

# Rows whose hashed e-mail repeats an earlier row (first occurrence is not counted)
duplicate_players = players_df['hashedEmail'].duplicated(keep='first').sum()

# Distinct player identifiers seen in each table
all_players = set(players_df['hashedEmail'].unique())
players_in_sessions = set(sessions_df['hashedEmail'].unique())

# Identifiers that have sessions but no row in the players table
orphan_sessions = players_in_sessions.difference(all_players)

# Identifiers in the players table that never appear in the session log
players_without_sessions = all_players.difference(players_in_sessions)

# --- Report ------------------------------------------------------------
print(f"Number of duplicate player records: {duplicate_players}")
print(f"\nPlayers in sessions but not in players table: {len(orphan_sessions)}")
print(f"Players without any sessions: {len(players_without_sessions)}")

Number of duplicate player records: 0

Players in sessions but not in players table: 0
Players without any sessions: 71


### Categorical Variables Distribution

In [9]:
# Columns of the players table treated as categorical
categorical_cols = ['experience', 'gender', 'subscribe']

print("Categorical Variables Distribution:", "=" * 50, sep="\n")

# For each column: frequency of every level, then the number of distinct levels
for col in categorical_cols:
    column_values = players_df[col]
    print(
        f"\n{col}:",
        column_values.value_counts(),
        f"Unique values: {column_values.nunique()}",
        sep="\n",
    )

Categorical Variables Distribution:

experience:
experience
Amateur     63
Veteran     48
Regular     36
Beginner    35
Pro         14
Name: count, dtype: int64
Unique values: 5

gender:
gender
Male                 124
Female                37
Non-binary            15
Prefer not to say     11
Two-Spirited           6
Agender                2
Other                  1
Name: count, dtype: int64
Unique values: 7

subscribe:
subscribe
True     144
False     52
Name: count, dtype: int64
Unique values: 2


### Numerical Variables Summary

In [10]:
# Numeric columns of the players table to summarise
numerical_cols = ['played_hours', 'Age']

print("Numerical Variables Summary Statistics:", "=" * 50, sep="\n")

# Last expression of the cell, so the notebook renders the summary as a table
players_df.loc[:, numerical_cols].describe()

Numerical Variables Summary Statistics:


Unnamed: 0,played_hours,Age
count,196.0,194.0
mean,5.85,21.14
std,28.36,7.39
min,0.0,9.0
25%,0.0,17.0
50%,0.1,19.0
75%,0.6,22.75
max,223.1,58.0


### Session Data Time Processing

In [11]:
# Parse the day/month/year timestamp strings into datetime values, replacing
# the original string columns on sessions_df
for time_col in ('start_time', 'end_time'):
    sessions_df[time_col] = pd.to_datetime(sessions_df[time_col], format='%d/%m/%Y %H:%M')

# Session length in minutes; rows without an end_time end up with a missing duration
elapsed = sessions_df['end_time'].sub(sessions_df['start_time'])
sessions_df['session_duration_minutes'] = elapsed.dt.total_seconds().div(60)

print("Session Duration Statistics (in minutes):")
print(sessions_df['session_duration_minutes'].describe())

# Data-quality check: a session that ends before it starts has a negative duration
negative_durations = sessions_df.loc[sessions_df['session_duration_minutes'].lt(0)]
print(f"\nSessions with negative duration: {len(negative_durations)}")

Session Duration Statistics (in minutes):
count   1533.00
mean      50.86
std       55.57
min        3.00
25%        9.00
50%       30.00
75%       73.00
max      259.00
Name: session_duration_minutes, dtype: float64

Sessions with negative duration: 0


## Next Steps

The data has been loaded successfully and the initial exploration is complete. The next steps are:
1. Comprehensive data description and variable documentation
2. Exploratory data analysis with visualizations
3. Feature engineering and data transformation
4. Research question formulation and methodology