# Data Science Project: Planning Stage (Individual)

## Predicting Player Engagement on a Minecraft Research Server

This project analyzes player behavior data from a Minecraft research server to understand patterns and predict player engagement levels.

## 1. Data Loading and Initial Exploration

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including pandas deprecation notices — consider narrowing it.
warnings.filterwarnings(action='ignore')

# pandas display options: show all columns, cap rows at 100, two-decimal floats
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Plot appearance: matplotlib style sheet plus the seaborn colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette='husl')

### Load the datasets

In [None]:
# Load the session log from the Excel workbook.
# NOTE(review): the "(2)" suffix looks like a duplicate-download file name —
# confirm this is the intended input file.
sessions_xlsx_path = 'sessions (2).xlsx'
sessions_df = pd.read_excel(io=sessions_xlsx_path)

# Report the dimensions, then preview the top rows as the cell output
print(f"Sessions data shape: {sessions_df.shape}")
sessions_df.head(n=5)

In [3]:
# Write the sessions table to CSV, leaving out the DataFrame index column
sessions_csv_path = 'sessions.csv'
sessions_df.to_csv(sessions_csv_path, index=False)
print("✓ sessions.csv created successfully")

✓ sessions.csv created successfully


### Load the players dataset (players.xlsx)

In [4]:
# Load the player table from the Excel workbook
players_xlsx_path = 'players.xlsx'
players_df = pd.read_excel(io=players_xlsx_path)

# Report the dimensions, then preview the top rows as the cell output
print(f"Players data shape: {players_df.shape}")
players_df.head(n=5)

Players data shape: (196, 7)


Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,Age
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9.0
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17.0
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17.0
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21.0
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21.0


In [None]:
# Re-read the sessions table from the CSV file; this rebinds sessions_df
sessions_df = pd.read_csv(filepath_or_buffer='sessions.csv')

# Summary lines: frame dimensions and the number of distinct player hashes
print(
    f"Sessions dataset shape: {sessions_df.shape}",
    f"Number of unique players in sessions: {sessions_df['hashedEmail'].nunique()}",
    "\nFirst 5 rows of sessions data:",
    sep="\n",
)
sessions_df.head(n=5)

### Data Types and Basic Information

In [None]:
# Column dtypes and non-null counts for the players table,
# framed by a 50-character rule above and below
rule = "=" * 50
print("Players Dataset Info:", rule, sep="\n")
players_df.info()
print("\n" + rule)

In [None]:
# Column dtypes and non-null counts for the sessions table,
# framed by a 50-character rule above and below
rule = "=" * 50
print("Sessions Dataset Info:", rule, sep="\n")
sessions_df.info()
print("\n" + rule)

### Check for Missing Values

In [None]:
# Check missing values in players dataset.
# Builds a per-column table of missing-value counts and percentages, then
# prints only the columns that actually contain missing values.
print("Missing Values in Players Dataset:")
print("=" * 40)
players_missing = players_df.isnull().sum()
# Reuse the counts instead of scanning the frame a second time
players_missing_pct = 100 * players_missing / len(players_df)
players_missing_table = pd.DataFrame({
    'Missing_Count': players_missing,
    'Percentage': players_missing_pct
})
players_with_missing = players_missing_table[players_missing_table['Missing_Count'] > 0]
# Print either the table or the all-clear message, so an
# "Empty DataFrame" repr never appears ahead of the message
if players_with_missing.empty:
    print("No missing values found in players dataset!")
else:
    print(players_with_missing)

In [None]:
# Check missing values in sessions dataset.
# Builds a per-column table of missing-value counts and percentages, then
# prints only the columns that actually contain missing values.
print("Missing Values in Sessions Dataset:")
print("=" * 40)
sessions_missing = sessions_df.isnull().sum()
# Reuse the counts instead of scanning the frame a second time
sessions_missing_pct = 100 * sessions_missing / len(sessions_df)
sessions_missing_table = pd.DataFrame({
    'Missing_Count': sessions_missing,
    'Percentage': sessions_missing_pct
})
sessions_with_missing = sessions_missing_table[sessions_missing_table['Missing_Count'] > 0]
# Print either the table or the all-clear message, so an
# "Empty DataFrame" repr never appears ahead of the message
if sessions_with_missing.empty:
    print("No missing values found in sessions dataset!")
else:
    print(sessions_with_missing)

### Data Quality Checks

In [None]:
# Count hashedEmail values that repeat an earlier row in the players table
player_emails = players_df['hashedEmail']
duplicate_players = player_emails.duplicated().sum()
print(f"Number of duplicate player records: {duplicate_players}")

# Distinct player hashes seen in each table
players_in_sessions = set(sessions_df['hashedEmail'].unique())
all_players = set(player_emails.unique())

# Set differences in both directions:
#   orphan_sessions          -> hashes in sessions that are missing from players
#   players_without_sessions -> hashes in players that never appear in sessions
orphan_sessions = players_in_sessions.difference(all_players)
players_without_sessions = all_players.difference(players_in_sessions)

print(f"\nPlayers in sessions but not in players table: {len(orphan_sessions)}")
print(f"Players without any sessions: {len(players_without_sessions)}")

### Categorical Variables Distribution

In [None]:
# Columns of the players table that are summarised as categories
categorical_cols = ['experience', 'gender', 'subscribe']

print("Categorical Variables Distribution:", "=" * 50, sep="\n")

# For each column: a heading, the frequency table, then the distinct-value count
for col in categorical_cols:
    print(
        f"\n{col}:",
        players_df[col].value_counts(),
        f"Unique values: {players_df[col].nunique()}",
        sep="\n",
    )

### Numerical Variables Summary

In [None]:
# Descriptive statistics for the numeric player columns
numerical_cols = ['played_hours', 'Age']
print("Numerical Variables Summary Statistics:", "=" * 50, sep="\n")
# The summary table is the last expression, so it renders as the cell output
players_df.loc[:, numerical_cols].describe()

### Session Data Time Processing

In [None]:
# Parse both timestamp columns with one shared day-first format string
time_format = '%d/%m/%Y %H:%M'
for time_col in ('start_time', 'end_time'):
    sessions_df[time_col] = pd.to_datetime(sessions_df[time_col], format=time_format)

# Session length: end minus start, converted from seconds to minutes
elapsed = sessions_df['end_time'].sub(sessions_df['start_time'])
sessions_df['session_duration_minutes'] = elapsed.dt.total_seconds().div(60)

print("Session Duration Statistics (in minutes):")
print(sessions_df['session_duration_minutes'].describe())

# Data-quality check: rows whose end precedes their start
is_negative = sessions_df['session_duration_minutes'].lt(0)
negative_durations = sessions_df[is_negative]
print(f"\nSessions with negative duration: {len(negative_durations)}")

## Next Steps

The data has been successfully loaded and the initial exploration is complete. The next steps will include:
1. Comprehensive data description and variable documentation
2. Exploratory data analysis with visualizations
3. Feature engineering and data transformation
4. Research question formulation and methodology