# World of Warcraft 

Data source: [Kaggle](https://www.kaggle.com/datasets/mylesoneill/warcraft-avatar-history?resource=download)

EDA


In [1]:
import os
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from src.utils import prep_parquet

In [2]:
# Paths for the raw CSV and for the parquet file this notebook reads.
org_csv_file_path = "./data/wowah_data.csv"
parquet_file_path = "./data/wowah_data.parquet"

# Build the parquet file only when it is missing, then load it through the
# single `parquet_file_path` variable so the existence check and the read can
# never point at different files.
if not os.path.exists(parquet_file_path):
    # NOTE(review): prep_parquet is assumed to write to parquet_file_path — confirm in src/utils.
    prep_parquet(org_csv_file_path, playtime=1)
df = pd.read_parquet(parquet_file_path)

df.head()

Unnamed: 0,char,level,race,charclass,zone,guild,timestamp
0,59425,1,Orc,Rogue,Orgrimmar,165,2008-01-01 00:02:04
1,65494,9,Orc,Hunter,Durotar,-1,2008-01-01 00:02:04
2,65325,14,Orc,Warrior,Ghostlands,-1,2008-01-01 00:02:04
3,65490,18,Orc,Hunter,Ghostlands,-1,2008-01-01 00:02:04
4,2288,60,Orc,Hunter,Hellfire Peninsula,-1,2008-01-01 00:02:09


In [3]:
# Report the (rows, columns) dimensions of the loaded frame.
frame_dimensions = df.shape
print(frame_dimensions)

(10817778, 7)


In [4]:
# Print the index range, column dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10817778 entries, 0 to 10817777
Data columns (total 7 columns):
 #   Column     Dtype         
---  ------     -----         
 0   char       int64         
 1   level      int64         
 2   race       object        
 3   charclass  object        
 4   zone       object        
 5   guild      int64         
 6   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 577.7+ MB


## Understand features

Analysis around data distribution 


Remove players who have only 1 record

- We can use the player data to understand the early churn of players

In [5]:
# Show the time span covered by the records (earliest and latest timestamp).
timestamps = df["timestamp"]
earliest = np.min(timestamps)
latest = np.max(timestamps)
print("From ", earliest, "to", latest)

From  2008-01-01 00:02:04 to 2008-12-31 23:50:18


In [6]:
# Count the distinct character ids present in the data.
distinct_chars = df["char"].unique()
print("Number of unique players", len(distinct_chars))

Number of unique players 28398


In [7]:
# Derive calendar parts from the datetime64 "timestamp" column with the
# vectorized `.dt` accessor. The row-wise `apply(lambda ...)` it replaces built
# a pd.Timestamp object for every one of the ~10.8M rows, three times over.
# Note: depending on the pandas version the resulting integer dtype may be
# int32 rather than int64; the values are the same.
timestamps = df["timestamp"]
df["year"] = timestamps.dt.year
df["month"] = timestamps.dt.month
df["day"] = timestamps.dt.day
df.head()

Unnamed: 0,char,level,race,charclass,zone,guild,timestamp,year,month,day
0,59425,1,Orc,Rogue,Orgrimmar,165,2008-01-01 00:02:04,2008,1,1
1,65494,9,Orc,Hunter,Durotar,-1,2008-01-01 00:02:04,2008,1,1
2,65325,14,Orc,Warrior,Ghostlands,-1,2008-01-01 00:02:04,2008,1,1
3,65490,18,Orc,Hunter,Ghostlands,-1,2008-01-01 00:02:04,2008,1,1
4,2288,60,Orc,Hunter,Hellfire Peninsula,-1,2008-01-01 00:02:09,2008,1,1


Number of players over time

In [None]:
# Number of distinct characters seen in each (year, month).
# The bare `df.groupby([...])` only created a GroupBy object and computed
# nothing; aggregate it so the cell actually shows players over time.
# Uses the "year" and "month" columns derived from "timestamp".
players_per_month = (
    df.groupby(["year", "month"])["char"]
    .nunique()
    .rename("unique_players")
)
players_per_month

How frequently players play the game
- Average number of play records per player

In [None]:
# Mean number of records per character, truncated to an integer.
records_per_char = df.groupby(["char"]).size()
mean_records = np.mean(records_per_char)
AVERAGEPLAYCOUNT = int(mean_records)
print(f"Average number of play time is {AVERAGEPLAYCOUNT}")

In [None]:
# Count characters with more records than the average. The per-character
# counts are computed once instead of running the same groupby over the full
# frame twice inside one expression.
records_per_char = df.groupby(["char"]).size()
above_average = records_per_char[records_per_char > AVERAGEPLAYCOUNT]
print(len(above_average), "players are played the game more than", AVERAGEPLAYCOUNT, "times.")

In [None]:
# get the most top 10 players
top10players = df.groupby(["char"]).size().sort_values(ascending=False).index[:10]
top10players

In [None]:
# Keep only the rows belonging to the ids in top10players, then draw one
# stacked bar per character with one segment per level.
top10player_history = df.query("char in @top10players")
records_by_char_level = top10player_history.groupby(["char", "level"]).size()
level_table = records_by_char_level.unstack()
level_table.plot.bar(stacked=True, figsize=(16,10))

In [None]:
# Find the character with the most records and show how those records are
# spread across levels. `idxmax()` replaces the hard-coded record count (42801)
# and the hard-coded character id (10): the old lookup raised IndexError when
# no character had exactly that count, and `theplayer` was never used.
records_per_char = df.groupby(["char"]).size()
theplayer = records_per_char.idxmax()
df[df["char"] == theplayer].groupby(["char", "level"]).size()

In [None]:
# Number of records logged for each character id.
records_per_char = df.groupby(["char"]).size()
records_per_char

In [None]:
# Characters that appear in exactly one record. The per-character counts are
# computed once rather than running the groupby twice.
records_per_char = df.groupby(["char"]).size()
records_per_char[records_per_char == 1]

In [None]:
# Inspect every record of character id 2.
is_char_2 = df["char"] == 2
df[is_char_2]

In [None]:
# Characters with more than 1000 records. The per-character counts are
# computed once rather than running the groupby twice.
records_per_char = df.groupby(["char"]).size()
records_per_char[records_per_char > 1000]

In [None]:
# Inspect every record of character id 87853.
is_char_87853 = df["char"] == 87853
df[is_char_87853]