In [1]:
import os
import pandas as pd
import seaborn as sns
%cd ..
from src.data.dataset import load_kaggle_data

d:\Projects\my-ds-template


Prepare datasets for further analysis/modeling.  
Example based on 3 tables of characters information from the video game Genshin Impact.

## 1. Loading data
- Retrieving (a subset of) rows
- Sorting
- Testing equality, monotonicity
- Column operations


In [43]:
dataset_path = 'https://www.kaggle.com/datasets/genshinplayer/genshin-impact-characters-stats'
path, files = load_kaggle_data(dataset_path)
df1 = pd.read_csv(os.path.join(path,files[0])) # skiprows, nrows, header
df1[df1.Character=='Amber']

Skipping, found downloaded files in "../data/raw/genshin-impact-characters-stats" (use force=True to force download)


Unnamed: 0,Character,Lv,Rarity,Element,Weapon,Main role,Ascension,Base HP,Base ATK,Base DEF
0,Amber,1,4,Pyro,Bow,Sub DPS,ATK,793,19,50
1,Amber,20,4,Pyro,Bow,Sub DPS,ATK,2038,48,129
2,Amber,20,4,Pyro,Bow,Sub DPS,ATK,2630,62,167
3,Amber,40,4,Pyro,Bow,Sub DPS,ATK,3940,93,250
4,Amber,40,4,Pyro,Bow,Sub DPS,ATK,4361,103,277
5,Amber,50,4,Pyro,Bow,Sub DPS,ATK,5016,118,318
6,Amber,50,4,Pyro,Bow,Sub DPS,ATK,5578,131,354
7,Amber,60,4,Pyro,Bow,Sub DPS,ATK,6233,147,396
8,Amber,60,4,Pyro,Bow,Sub DPS,ATK,6654,157,422
9,Amber,70,4,Pyro,Bow,Sub DPS,ATK,7309,172,464


This table contains information of each Character from Lv1 to Lv80.  
For each level from Lv20 to Lv 80, there are 2 rows with different HP, ATK & DEF, assumed before/after Ascension.  

In [44]:
# Test Character has identical levels
from pandas import testing as tm
for char, c_df in df1.groupby('Character'):
    # Verify that Lv column for each Character is the same
    tm.assert_series_equal(c_df['Lv'], df1.head(14)['Lv'], check_index=False)
    # Verify that the order of the rows is always before Ascension -> after Ascension
    assert c_df['Base HP'].is_monotonic_increasing==True

No AssertionError, assumption verified.   
If the rows were not in order, use .sort_values().reset_index(drop=True). 

In [45]:
# Sample all rows from df1 will disrupt the order
# .sort_values method supports multiple columns
# it will sort values according to the first column
# if there are identical values, then refer to the next column.
df1.sample(len(df1)).sort_values(by=['Base ATK','Character']).reset_index(drop=True)  

Unnamed: 0,Character,Lv,Rarity,Element,Weapon,Main role,Ascension,Base HP,Base ATK,Base DEF
0,Hutao,1,5,Pyro,Polearm,DPS,CRIT DMG,1211,8,68
1,Barbara,1,4,Hydro,Catalyst,Healer,HP,821,13,56
2,Sucrose,1,4,Anemo,Catalyst,Support,Anemo DMG,775,14,59
3,Bennett,1,4,Pyro,Sword,Healer,Energy Recharge,1039,16,65
4,Noelle,1,4,Geo,Claymore,Support,DEF,1012,16,67
...,...,...,...,...,...,...,...,...,...,...
569,Ganyu,90,5,Cryo,Bow,DPS,CRIT DMG,9797,335,630
570,Shogun,90,5,Electro,Polearm,Sub DPS,Energy Recharge,12907,337,789
571,Ayaka,90,5,Cryo,Sword,DPS,CRIT DMG,12858,342,784
572,Eula,90,5,Cryo,Claymore,DPS,CRIT DMG,13226,342,751


In [46]:
# Rename columns. Good names are important!
df1 = df1.rename({'Ascension':'Ascension Stat'},axis=1)
# Create a new column representing whether it is before or after Ascension
df1['Ascension'] = len(set(df1.Character))*([pd.NA]+[0,1]*6+[pd.NA])
df1.head(5)

Unnamed: 0,Character,Lv,Rarity,Element,Weapon,Main role,Ascension Stat,Base HP,Base ATK,Base DEF,Ascension
0,Amber,1,4,Pyro,Bow,Sub DPS,ATK,793,19,50,
1,Amber,20,4,Pyro,Bow,Sub DPS,ATK,2038,48,129,0.0
2,Amber,20,4,Pyro,Bow,Sub DPS,ATK,2630,62,167,1.0
3,Amber,40,4,Pyro,Bow,Sub DPS,ATK,3940,93,250,0.0
4,Amber,40,4,Pyro,Bow,Sub DPS,ATK,4361,103,277,1.0


In [50]:
# Second data source include additional information on Region and Model Type
url= 'https://genshin-impact.fandom.com/wiki/Characters'
df2 = pd.read_html(url)[2]
df2 = df2.drop(columns=['Icon','Rarity'])
df2 = df2.rename({'Name':'Character'},
                 axis=1)
df2.sample(3)  #  head, tail

Unnamed: 0,Character,Element,Weapon,Region,Model Type
11,Fischl,Electro,Bow,Mondstadt,Medium Female
15,Jean,Anemo,Sword,Mondstadt,Tall Female
27,Noelle,Geo,Claymore,Mondstadt,Medium Female


In [79]:
# Third data source include stats of more Characters at Lv90
url= 'https://genshin-impact.fandom.com/wiki/Characters/Comparison'
df3 = pd.read_html(url)[1]
df3 = df3.drop(columns=['Icon'])
df3.rename(columns={'Name':'Character',
                    'HP': 'Base HP',
                    'ATK': 'Base ATK',
                    'DEF': 'Base DEF',
                    }, 
           inplace=True)
df3.head(4)

Unnamed: 0,Character,Base HP,Base ATK,Base DEF,Ascension Stat,Ascension Stat Value
0,Albedo,13226,251,876,Geo DMG Bonus,28.8%
1,Aloy,10899,234,676,Cryo DMG Bonus,28.8%
2,Amber,9461,223,601,ATK,24.0%
3,Arataki Itto,12858,227,959,CRIT Rate,19.2%


## 2. Working with multiple tables
- String processing
- Union, Intersection, Difference, Symmetric Difference
- Merge (Join) multiple tables  
    Here's a [helpful question on Stack Overflow](https://stackoverflow.com/questions/53645882/pandas-merging-101/53645883#53645883).

To merge `df1` and `df2`, we need to create an unique key to match the rows to the characters.

In [80]:
# Table 1: 
# Only first name for japanese names
df1.Character.unique()

array(['Amber', 'Barbara', 'Beidou', 'Bennett', 'Chongyun', 'Diluc',
       'Fischl', 'Jean', 'Kaeya', 'Keqing', 'Klee', 'Lisa', 'Mona',
       'Ningguang', 'Noelle', 'Qiqi', 'Razor', 'Sucrose', 'Venti',
       'Xiangling', 'Xiao', 'Xingqiu', 'Tartaglia', 'Zhongli', 'Diona',
       'Xinyan', 'Ganyu', 'Albedo', 'Rosaria', 'Ayaka', 'Hutao', 'Yanfei',
       'Eula', 'Kazuha', 'Yoimiya', 'Sayu', 'Traveler', 'Shogun', 'Aloy',
       'Sara', 'Kokomi'], dtype=object)

In [66]:
# Table 2&3 have identical Characters
assert (df3.Character.unique() == df2.Character.unique()).all()
df3.Character.unique()

array(['Albedo', 'Aloy', 'Amber', 'Arataki Itto', 'Barbara', 'Beidou',
       'Bennett', 'Chongyun', 'Diluc', 'Diona', 'Eula', 'Fischl', 'Ganyu',
       'Gorou', 'Hu Tao', 'Jean', 'Kaedehara Kazuha', 'Kaeya',
       'Kamisato Ayaka', 'Kamisato Ayato', 'Keqing', 'Klee', 'Kujou Sara',
       'Kuki Shinobu', 'Lisa', 'Mona', 'Ningguang', 'Noelle', 'Qiqi',
       'Raiden Shogun', 'Razor', 'Rosaria', 'Sangonomiya Kokomi', 'Sayu',
       'Shenhe', 'Shikanoin Heizou', 'Sucrose', 'Tartaglia', 'Thoma',
       'Traveler', 'Venti', 'Xiangling', 'Xiao', 'Xingqiu', 'Xinyan',
       'Yae Miko', 'Yanfei', 'Yelan', 'Yoimiya', 'Yun Jin', 'Zhongli'],
      dtype=object)

We can use the lowercase of the full names without spaces as ID. 

In [92]:
# Create unique IDs for characters based on their full names in table 2
df2['ID'] = df2['Character'].transform(lambda x: x.replace(' ','').lower())
df3['ID'] = df3['Character'].transform(lambda x: x.replace(' ','').lower())

# Assign ID to characters in table 1
def get_id(x, id_list):
    for id in id_list:
        if x.lower() in id:
            return id
    return x
df1['ID'] = df1['Character'].apply(lambda x: get_id(x, df2['ID'].unique()))

In [82]:
# Intersection, Difference, Union of sets
len(set(df2.ID)|set(df1.ID)) # &, -, | 

51

In [85]:
# All ID in df1 can be found in df2
set(df1.ID) - set(df2.ID)

set()

In [86]:
set(df2.columns) & set(df1.columns)

{'Character', 'Element', 'ID', 'Weapon'}

Before we merge everything together, IMO there is a clear difference of potential use between a table with only lv90 data and a table with data from lv1 to lv90. So I will separately store them.

In [93]:
df2['ID'] = df2['Character'].transform(lambda x: x.replace(' ','').lower())
columns_from_df2 = ['ID']+list(set(df2.columns) - set(df1.columns))
df_growth = pd.merge(df1, df2[columns_from_df2], on='ID', how='left')
df_growth = df_growth.set_index('ID').reset_index()
df_growth

Unnamed: 0,ID,Character,Lv,Rarity,Element,Weapon,Main role,Ascension Stat,Base HP,Base ATK,Base DEF,Ascension,Model Type,Region
0,amber,Amber,1,4,Pyro,Bow,Sub DPS,ATK,793,19,50,,Medium Female,Mondstadt
1,amber,Amber,20,4,Pyro,Bow,Sub DPS,ATK,2038,48,129,0,Medium Female,Mondstadt
2,amber,Amber,20,4,Pyro,Bow,Sub DPS,ATK,2630,62,167,1,Medium Female,Mondstadt
3,amber,Amber,40,4,Pyro,Bow,Sub DPS,ATK,3940,93,250,0,Medium Female,Mondstadt
4,amber,Amber,40,4,Pyro,Bow,Sub DPS,ATK,4361,103,277,1,Medium Female,Mondstadt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,sangonomiyakokomi,Kokomi,70,5,Hydro,Catalyst,Healer,Hydro DMG,10306,179,503,0,Medium Female,Inazuma
570,sangonomiyakokomi,Kokomi,70,5,Hydro,Catalyst,Healer,Hydro DMG,10945,190,534,1,Medium Female,Inazuma
571,sangonomiyakokomi,Kokomi,80,5,Hydro,Catalyst,Healer,Hydro DMG,11885,207,580,0,Medium Female,Inazuma
572,sangonomiyakokomi,Kokomi,80,5,Hydro,Catalyst,Healer,Hydro DMG,12524,218,611,1,Medium Female,Inazuma


In [101]:
df_lv_90 = df3
columns_from_df2 = ['ID']+list(set(df2.columns) - set(df3.columns))
df_lv_90 = pd.merge(df3, df2[columns_from_df2], on='ID', how='left')
df_lv_90 = df_lv_90.set_index('ID').reset_index()
df_lv_90

Unnamed: 0,ID,Character,Base HP,Base ATK,Base DEF,Ascension Stat,Ascension Stat Value,Element,Weapon,Model Type,Region
0,albedo,Albedo,13226,251,876,Geo DMG Bonus,28.8%,Geo,Sword,Medium Male,Mondstadt
1,aloy,Aloy,10899,234,676,Cryo DMG Bonus,28.8%,Cryo,Bow,Medium Female,
2,amber,Amber,9461,223,601,ATK,24.0%,Pyro,Bow,Medium Female,Mondstadt
3,aratakiitto,Arataki Itto,12858,227,959,CRIT Rate,19.2%,Geo,Claymore,Tall Male,Inazuma
4,barbara,Barbara,9787,159,669,HP,24.0%,Hydro,Catalyst,Medium Female,Mondstadt
5,beidou,Beidou,13050,225,648,Electro DMG Bonus,24.0%,Electro,Claymore,Tall Female,Liyue
6,bennett,Bennett,12397,191,771,Energy Recharge,26.7%,Pyro,Sword,Medium Male,Mondstadt
7,chongyun,Chongyun,10984,223,648,ATK,24.0%,Cryo,Claymore,Medium Male,Liyue
8,diluc,Diluc,12981,335,784,CRIT Rate,19.2%,Pyro,Claymore,Tall Male,Mondstadt
9,diona,Diona,9570,212,601,Cryo DMG Bonus,24.0%,Cryo,Bow,Short Female,Mondstadt


## 3. Update data
- Check & fill missing values (na/null)  
- Update cells according to index and label


In [142]:
# Find mismatched values
columns = ['ID','Base HP','Base ATK','Base DEF']

hp_df1 = df_lv_90[columns].copy()
hp_df1['tabel'] = 'lv 90'

hp_df3 = df_growth[df_growth.Lv==90][columns].copy()
hp_df3['tabel'] = 'growth'

pd.concat([hp_df1, hp_df3], axis=0).drop_duplicates(subset=columns, keep=False).sort_values('ID')

Unnamed: 0,ID,Base HP,Base ATK,Base DEF,tabel
3,aratakiitto,12858,227,959,lv 90
13,gorou,9570,183,648,lv 90
19,kamisatoayato,13715,299,769,lv 90
23,kukishinobu,12289,212,751,lv 90
34,shenhe,12993,304,830,lv 90
35,shikanoinheizou,10657,225,684,lv 90
36,sucrose,9243,170,703,lv 90
251,sucrose,9244,170,703,growth
38,thoma,10331,202,751,lv 90
43,xingqiu,10223,202,758,lv 90


In [143]:
df_growth.isna().sum()

ID                 0
Character          0
Lv                 0
Rarity             0
Element            0
Weapon             0
Main role          0
Ascension Stat     0
Base HP            0
Base ATK           0
Base DEF           0
Ascension         82
Model Type         0
Region            28
dtype: int64

In [147]:
df_lv_90.isna().sum()

ID                      0
Character               0
Base HP                 0
Base ATK                0
Base DEF                0
Ascension Stat          0
Ascension Stat Value    0
Element                 0
Weapon                  0
Model Type              0
Region                  2
dtype: int64

In [150]:
df_lv_90[df_lv_90.Region.isna()]

Unnamed: 0,ID,Character,Base HP,Base ATK,Base DEF,Ascension Stat,Ascension Stat Value,Element,Weapon,Model Type,Region
1,aloy,Aloy,10899,234,676,Cryo DMG Bonus,28.8%,Cryo,Bow,Medium Female,
39,traveler,Traveler,10875,212,683,ATK,24.0%,,Sword,Aether: Medium MaleLumine: Medium Female,


In [165]:
# Sometimes, if we have to, we can update values manually.
# Difference between loc & iloc see https://stackoverflow.com/questions/31593201/how-are-iloc-and-loc-different
df_lv_90.loc[[1,39], ['Region']] = [['Unknow'], ['Nora']]
df_lv_90.iloc[[1,39]]

Unnamed: 0,ID,Character,Base HP,Base ATK,Base DEF,Ascension Stat,Ascension Stat Value,Element,Weapon,Model Type,Region
1,aloy,Aloy,10899,234,676,Cryo DMG Bonus,28.8%,Cryo,Bow,Medium Female,Unknow
39,traveler,Traveler,10875,212,683,ATK,24.0%,,Sword,Aether: Medium MaleLumine: Medium Female,Nora


In [221]:
df_growth[df_growth['ID']=='albedo'].head(1)[['Rarity','Main role']].values

array([[5, 'Support']], dtype=object)

In [227]:
df_lv_90[['Rarity','Main role']] = df_lv_90.apply(lambda row: df_growth[df_growth['ID']==row['ID']].head(1)[['Rarity','Main role']].values.flatten() if row['ID'] in df_growth.ID.unique() else pd.NA*2, result_type='expand', axis=1)

In [185]:
df_growth.to_csv('data/processed/genshin_character_growth.csv', index=False)
df_lv_90.to_csv('data/processed/genshin_character_lv_90.csv', index=False)

## Checklist
- Assumptions  
  1. Names in df1 are substrings from df2 after removing spaces and lowercaseization.
- Missing values  
  - Traveler and Aloe don't have Region label.  
  - Lv1 and Lv90 rows don't have Ascension label.


In [141]:
# query detailed information for each character on fandom.com
url= 'https://genshin-impact.fandom.com/wiki/Razor'
df = pd.read_html(url)[2]
df.head(3)

Unnamed: 0,AscensionPhase,Level,BaseHP,BaseATK1,BaseDEF,Special Stat2(Physical DMG Bonus)
0,0✦,1/20,1003,20,63,—
1,0✦,20/20,2577,50,162,—
2,"Ascension Cost (0 → 1)20,000 20,000 Mora1 1 Va...","Ascension Cost (0 → 1)20,000 20,000 Mora1 1 Va...","Ascension Cost (0 → 1)20,000 20,000 Mora1 1 Va...","Ascension Cost (0 → 1)20,000 20,000 Mora1 1 Va...","Ascension Cost (0 → 1)20,000 20,000 Mora1 1 Va...","Ascension Cost (0 → 1)20,000 20,000 Mora1 1 Va..."
