## import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

## load dataset

In [2]:
raw = pd.read_csv('steam_games.csv')
raw.sample(5)

Unnamed: 0,title,url,image,release_date,platforms,discount_rate,original_price,discounted_price,developer,publisher,overall_reviews,text_reviews,description,tags,processor,ram,graphic_card,rating,language,metacriticts
18639,Solo Fox,https://store.steampowered.com/app/1525650/Sol...,https://cdn.akamai.steamstatic.com/steam/apps/...,"13 Feb, 2021",Windows,,Rp 24 999,,"""RevDay Studio""","""RevDay Studio""",Positive,- 100% of the 13 user reviews for this game ar...,2D Indie-platformer about the fox Solo.,"Adventure,2D Platformer,2D,Platformer,Linear,N...",intel Atom,256 MB RAM,,,"English,French,German,Spanish - Spain,Russian",
40504,Demon Hunter,https://store.steampowered.com/app/535530/Demo...,https://cdn.akamai.steamstatic.com/steam/apps/...,"29 Sep, 2016",Windows,,Rp 135 999,,"Beijing Skyline Interaction Technology Co., Ltd.","Beijing Skyline Interaction Technology Co., Ltd.",8 user reviews,- Need more user reviews to generate a score,Demon hunter it is a kind of virtual reality R...,"Action,Adventure,RPG,VR",i5 6500,4 GB RAM,GTX970,,English,
50159,东方蝶梦志 交响组曲 ~ 梦中的钧天广乐,https://store.steampowered.com/app/941440/_/?s...,https://cdn.akamai.steamstatic.com/steam/apps/...,"14 Sep, 2018",,,Rp 24 999,,弦语蝶梦,弦语蝶梦,5 user reviews,- Need more user reviews to generate a score,,"Action,Indie,Casual",,,,,,
38698,欢迎回家-Welcome Home,https://store.steampowered.com/app/1870020/Wel...,https://cdn.akamai.steamstatic.com/steam/apps/...,"14 Feb, 2022",Windows,,Rp 52 999,,橙光游戏,橙光游戏,,,This is a secret room to escape burning brain ...,"Interactive Fiction,Puzzle,Casual,Mystery,Adve...",1Ghz or faster processor,1 GB RAM,512MB VRAM,,"English,Simplified Chinese",
15837,Aztez,https://store.steampowered.com/app/244750/Azte...,https://cdn.akamai.steamstatic.com/steam/apps/...,"1 Aug, 2017","Windows,Mac OS,Linux",,Rp 135 999,,Team Colorblind,Team Colorblind,Mostly Positive,- 76% of the 130 user reviews for this game ar...,"Aztez is a highly stylized, brutally satisfyin...","Action,Indie,Strategy,Beat 'em up,Character Ac...",2.4 GHz Dual core,2 GB RAM,"Intel HD 2000, NVIDIA GeForce 210, or ATI 4850",Rating for: PEGI,English,81.0


## data understanding

In [3]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68468 entries, 0 to 68467
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             68468 non-null  object 
 1   url               68468 non-null  object 
 2   image             68468 non-null  object 
 3   release_date      62160 non-null  object 
 4   platforms         66194 non-null  object 
 5   discount_rate     5401 non-null   object 
 6   original_price    67795 non-null  object 
 7   discounted_price  5401 non-null   object 
 8   developer         61855 non-null  object 
 9   publisher         56815 non-null  object 
 10  overall_reviews   55479 non-null  object 
 11  text_reviews      55479 non-null  object 
 12  description       40956 non-null  object 
 13  tags              61039 non-null  object 
 14  processor         55299 non-null  object 
 15  ram               55091 non-null  object 
 16  graphic_card      51361 non-null  object

Observations:
- `url`,`image`,`discount_rate`,`discounted_price`,`description` drop
- `release_date` extract month and year
- `platfoms`,`developer`,`publisher`,`tags` split
- `original_price` change data type (extract price)
- `text_reviews` extract rating (`user_rating`) and total user reviews (`total_reviews`)
- `language` extract number of language (`language`)
- `metacriticts` too much missing values, drop

In [4]:
raw.describe()

Unnamed: 0,metacriticts
count,1881.0
mean,80.971823
std,4.458169
min,75.0
25%,77.0
50%,80.0
75%,84.0
max,97.0


In [5]:
raw.describe(include='object')

Unnamed: 0,title,url,image,release_date,platforms,discount_rate,original_price,discounted_price,developer,publisher,overall_reviews,text_reviews,description,tags,processor,ram,graphic_card,rating,language
count,68468,68468,68468,62160,66194,5401,67795,5401,61855,56815,55479,55479,40956,61039,55299,55091,51361,13428,59912
unique,64887,68468,65033,3894,12,89,4591,1500,24765,20134,18,13440,39011,40137,12423,859,14567,1,7879
top,Game + Soundtrack,https://store.steampowered.com/app/730/Counter...,https://cdn.akamai.steamstatic.com/steam/apps/...,"14 Oct, 2020",Windows,-50%,Rp 69 999,Rp 4 249,Ubisoft - San Francisco,Degica,Very Positive,- Need more user reviews to generate a score,Minimal physical puzzle with explosions,Action,2.66 GHz Intel Core2 Duo E6750 or 2.8 GHz AMD ...,4 GB RAM,256 MB DirectX 9 / NVIDIA® GeForce® 8600 GT or...,Rating for: PEGI,English
freq,13,1,6,146,44961,1006,5108,329,1167,1049,11078,15169,15,1715,801,13411,1167,13428,24115


observations:
- `rating` data is wrong, consider drop or rescrape data
- `tags` too much unique value, consider to create new columns tag that has been aggregated

## data preprocessing (format)

### drop duplicates and uninformative columns

In [6]:
df_drop = raw.copy()
df_drop.drop_duplicates(subset='title',inplace=True)
df_drop['title'].duplicated().sum()

0

In [7]:
drop = ['url','image','discount_rate','discounted_price','description','metacriticts','processor','graphic_card'] #these variables unable to show any kind of valuable information that can be used for current analysis and machine learning model, consider to be used for other analysis
df_drop = df_drop.drop(labels=drop,axis=1)
df_drop.sample(10)

Unnamed: 0,title,release_date,platforms,original_price,developer,publisher,overall_reviews,text_reviews,tags,ram,rating,language
33416,"MONSTER HUNTER RISE - ""Attack"" pose set","12 Jan, 2022",Windows,Rp 28 999,"CAPCOM Co., Ltd.","CAPCOM Co., Ltd.",1 user reviews,- Need more user reviews to generate a score,"Action,RPG,Co-op,Action RPG,JRPG",8 GB RAM,Rating for: PEGI,"English,French,Italian,German,Spanish - Spain,..."
48100,Merrily Perilly Soundtrack + Art Book,"23 Oct, 2018","Windows,Mac OS",Rp 17 499,Squiddershins,,,,Indie,34 MB RAM,,English
27535,Guardians of Greyrock,"2 Apr, 2021",Windows,Free to Play,MythicWare,Dark Illusions Entertainment LLC,Mixed,- 45% of the 20 user reviews for this game are...,"Strategy,Card Game,Adventure,Indie,Casual,Card...",8 GB RAM,,English
40941,Neptunia Virtual Stars - Game Club Project Pack,"29 Apr, 2021",Windows,Rp 32 999,"Idea Factory,Compile Heart",,,,"Action,Adventure,RPG",4 GB RAM,,"English,Japanese,Traditional Chinese"
17791,PD Howler 10,"6 May, 2016",Windows,Rp 82 999,"Daniel Ritchie - PDHowler.com,Philip Staiger -...",KPL,Positive,- 92% of the 14 user reviews for this software...,"Design & Illustration,Photo Editing,Animation ...",2 GB RAM,,English
11734,绽于枝垂樱下~Flowering Across the Hakugyokurou,"12 Jul, 2019",Windows,Rp 8 499,凝冰剑斩,凝冰剑斩,Very Positive,- 96% of the 112 user reviews for this game ar...,"RPG,Indie,RPGMaker,Female Protagonist,LGBTQ+,V...",512 MB RAM,,"English,Simplified Chinese"
19607,The foreigner,"5 Feb, 2020",Windows,Rp 8 499,HCGstudio,HCGstudio,Positive,- 86% of the 22 user reviews for this game are...,"Adventure,Indie,Casual,Nudity,Sexual Content,G...",2 GB RAM,,"English,Simplified Chinese,Traditional Chinese"
50030,Deeply Dark,"31 May, 2021",Windows,Rp 99 999,DALESI,DALESI,3 user reviews,- Need more user reviews to generate a score,"Early Access,Indie,Roguelike,Inventory Managem...",6 GB RAM,,"English,Spanish - Latin America"
26384,Luminos,"11 Apr, 2019",Windows,,dev4play,dev4play,Positive,- 100% of the 15 user reviews for this game ar...,"Indie,Casual",2 GB RAM,,"English,French,Italian,German,Spanish - Spain,..."
24412,Book of Shadows,"1 Nov, 2020",Windows,Rp 32 999,Enoops,Enoops,Mostly Positive,- 78% of the 33 user reviews for this game are...,"Adventure,Casual,Puzzle,Point & Click,Arcade,E...",2 GB RAM,,"English,Russian,French,German,Polish"


### data extraction and formatting

#### extract `month` and `year`

In [8]:
#define function to extract year
def extract_year(x):
    year = None
    if type(x) == list:
        for i in x:
            if len(i) == 4:
                year = i
                return year
            else: 
                year = None
        return year
    else:
        year = x
    return year

In [9]:
#df_drop[['date','month','year']] = df_drop['release_date'].str.split(' ', n=2 , expand=True)
df_extract = df_drop.copy()
month = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
#df_extract['year'] = df_extract['release_date'].str[-4:].apply(pd.to_numeric, errors=('coerce')) 
df_extract['year'] = df_extract['release_date'].str.findall('(\d+)').apply(lambda x: extract_year(x)).astype(float)
df_extract['month'] = df_extract['release_date'].str.findall("[a-zA-Z]+").str[0]
df_extract['month'] = df_extract['month'].replace(['August','June'],['Aug','Jun'])
df_extract.loc[~df_extract['month'].isin(month),'month'] = None
df_extract = df_extract.drop('release_date', axis=1) #drop release date
df_extract.sample(5)

Unnamed: 0,title,platforms,original_price,developer,publisher,overall_reviews,text_reviews,tags,ram,rating,language,year,month
38306,Jurisdiction,Windows,Rp 108 999,Meteor Shock,Meteor Shock,8 user reviews,- Need more user reviews to generate a score,"Strategy,Simulation,Management,RTS,Sandbox,Cri...",8 GB RAM,,English,2021.0,Mar
63774,Would you like to run an idol café?,"Windows,Linux",Rp 69 999,StarlightTree Games,Winged Cloud,Mixed,- 60% of the 15 user reviews for this game are...,"Visual Novel,Simulation,Anime,Cute,Dating Sim,...",250 MB RAM,,"English,Simplified Chinese",2021.0,Jun
66583,Say No! More,"Windows,Mac OS",Rp 95 999,Studio Fizbin,Thunderful Publishing,Very Positive,- 91% of the 399 user reviews for this game ar...,"Indie,Funny,Casual,Comedy,Singleplayer,Cute,Co...",4 GB RAM,,"English,French,Italian,German,Spanish - Spain,...",2021.0,Apr
53659,Aerofly FS 2 - Orbx - Eagle County Colorado,"Windows,Mac OS,Linux",Rp 139 999,Orbx,IPACS,5 user reviews,- Need more user reviews to generate a score,"Simulation,Indie",,,"English,German,French",2018.0,Jan
12202,DW8XLCE - OLD COSTUME PACK 3,Windows,Rp 25 999,"KOEI TECMO GAMES CO., LTD.","KOEI TECMO GAMES CO., LTD.",Positive,- 81% of the 11 user reviews for this game are...,Action,256 MB RAM,Rating for: PEGI,"English,French,German",2014.0,Aug


#### extract `user_rating` and `total_reviews`

In [10]:
df_extract_2 = df_extract.copy()
df_extract_2['text_reviews'] = df_extract_2['text_reviews'].str.replace(',','')
df_extract_2['user_rating'] = df_extract_2['text_reviews'].str.findall('(\d+)').str[0].astype(float)
df_extract_2['total_review'] = df_extract_2['text_reviews'].str.findall('(\d+)').str[1].astype(float)
df_extract_2 = df_extract_2.drop('text_reviews',axis=1)
df_extract_2.sample(5)

Unnamed: 0,title,platforms,original_price,developer,publisher,overall_reviews,tags,ram,rating,language,year,month,user_rating,total_review
52717,BLOCK HEIST: Robbery Simulator,Windows,Rp 8 499,Cyber Gaming,Cyber Gaming,8 user reviews,"Action,Crime,Heist,Simulation,Third-Person Sho...",4 GB RAM,,"English,Russian,French,Italian,German,Spanish ...",2021.0,Dec,,
15909,DELUGE,Windows,Rp 39 999,Riggy2k3,Riggy2k3,Positive,"RPG,Emotional,Old School,Dark Fantasy,Pixel Gr...",256 MB RAM,,English,2021.0,Nov,100.0,15.0
16685,Dungeon Defenders - Etherian Holiday Extravaganza,"Windows,Mac OS,Linux",Rp 17 499,Trendy Entertainment,Trendy Entertainment,Mixed,"Indie,Action,RPG",,Rating for: PEGI,English,2011.0,Dec,68.0,16.0
7155,奇幻与砍杀2 Fantasy & Blade Ⅱ,Windows,Rp 59 999,绝汪,绝汪,Mostly Positive,"Action,Casual,Adventure,RPG,Indie,Strategy,RPG...",4 GB RAM,,"English,Simplified Chinese",2019.0,Oct,74.0,109.0
7539,Tower! 3D,Windows,Rp 64 999,FeelThere,FeelThere,Mixed,"Simulation,Flight,Multiplayer,Management,Casua...",8 GB RAM,,English,2020.0,Jul,56.0,50.0


#### extract `supported_language`

In [11]:
df_extract_3 = df_extract_2.copy()
df_extract_3['supported_language'] = df_extract_3['language'].str.split(',').apply(lambda x: len(x) if type(x) == list else x)
#df_extract_3['english_language'] = df_extract_3['language'].str.lower().str.contains('english')
df_extract_3 = df_extract_3.drop('language',axis=1)
df_extract_3.sample(5)

Unnamed: 0,title,platforms,original_price,developer,publisher,overall_reviews,tags,ram,rating,year,month,user_rating,total_review,supported_language
17541,Under Domain - Alien Invasion Simulator,Windows,Rp 48 999,Playlearn,Playlearn,Mixed,"Board Game,Aliens,Villain Protagonist,Time Man...",4 GB RAM,,2020.0,Oct,50.0,69.0,2.0
38747,Toasted!,Windows,Rp 64 999,Polygon Dust Entertainment Ltd.,Polygon Dust Entertainment Ltd.,,"Local Co-Op,Action,Vehicular Combat,Party,Co-o...",8 GB RAM,,2022.0,Mar,,,1.0
43865,Pride of Nations: American Civil War 1862,Windows,Rp 35 999,Ageod,Slitherine Ltd.,1 user reviews,Strategy,2 GB (,,2011.0,Aug,,,4.0
52932,Kaboom!,"Windows,Linux",Rp 52 999,CubeCoders Limited,CubeCoders Limited,1 user reviews,"Action,Casual,Arcade,Multiplayer,Battle Royale...",4096 MB RAM,,2021.0,Jul,,,1.0
23798,Ride 2 Kawasaki and Ducati Bonus Pack,Windows,Rp 32 999,Milestone S.r.l.,Milestone S.r.l.,Mixed,"Simulation,Racing,Sports",4 GB RAM,,2016.0,Oct,64.0,17.0,6.0


#### extract `price`

In [12]:
df_price = df_extract_3.copy()
df_price['price'] = df_price['original_price'].str.lower().str.strip('rp ').str.replace(' ','').astype(int,errors='ignore')
df_price.loc[df_price['price'].str.contains('free|demo|season|third|now',na=False),'price'] = 0
df_price['price'] = df_price['price'].astype(float)
df_price = df_price.drop('original_price',axis=1)
df_price.sample(5)

Unnamed: 0,title,platforms,developer,publisher,overall_reviews,tags,ram,rating,year,month,user_rating,total_review,supported_language,price
23557,Megalith,Windows,"Disruptive Games, Inc.","Disruptive Games, Inc.",Mixed,"Action,VR,MOBA,Multiplayer,Hero Shooter,Tactic...",8 GB RAM,,2020.0,May,65.0,49.0,1.0,119999.0
53049,Neptunia Virtual Stars - Unlock All BeatTik Da...,Windows,"Idea Factory,Compile Heart",Idea Factory International,,"Action,Adventure,RPG",4 GB RAM,,2021.0,Mar,,,3.0,17499.0
15507,Townopolis,"Windows,Mac OS,Linux",Lonely Troops,Lonely Troops,Mostly Positive,"Casual,Simulation,Strategy,Indie,Time Manageme...",1 GB RAM,,2016.0,Apr,75.0,52.0,6.0,32999.0
38849,White Day - Christmas Costume - Ji-Hyeon Seol,Windows,SONNORI Corp,"PQube,SONNORI Corp",4 user reviews,"Action,Adventure,Indie",4 GB RAM,,2017.0,Dec,,,9.0,17499.0
63790,Zero-K,Windows,Zero-K Team,Zero-K Team,Very Positive,"Strategy,Free to Play,RTS,Indie,Simulation,Mul...",4 GB RAM,,2018.0,Apr,82.0,34.0,1.0,0.0


#### extract `rated_by_pegi`

In [13]:
df_pegi = df_price.copy()
df_pegi['pegi_rated'] = np.where(df_pegi['rating']=='Rating for: PEGI', 1, 0)
df_pegi = df_pegi.drop('rating',axis=1)
df_pegi.sample(5)

Unnamed: 0,title,platforms,developer,publisher,overall_reviews,tags,ram,year,month,user_rating,total_review,supported_language,price,pegi_rated
66618,Borderlands 3: Multiverse Disciples of the Vau...,Windows,Gearbox Software,2K,Mixed,"Action,RPG,Gore,Violent,Sexual Content",6 GB RAM,2021.0,Apr,46.0,15.0,11.0,150000.0,1
63978,Sentience: The Android's Tale,"Windows,Mac OS",Pilgrim Adventures,GrabTheGames,Very Positive,"Adventure,Indie,Sci-fi,Multiple Endings,Futuri...",2 GB RAM,2017.0,Jun,89.0,76.0,1.0,31000.0,0
64567,Neon Chrome,"Windows,Mac OS,Linux",10tons Ltd,10tons Ltd,Very Positive,"Cyberpunk,Twin Stick Shooter,Action,Action Rog...",2048 MB RAM,2016.0,Apr,94.0,474.0,6.0,115999.0,1
55880,Trainz 2019 DLC: Avery - Drexel Route,"Windows,Mac OS",N3V Games,,2 user reviews,Simulation,4 GB RAM,2019.0,Jan,,,6.0,139999.0,0
22979,Becalm,"Windows,Mac OS,Linux",Colorfiction,Colorfiction,Very Positive,"Free to Play,Casual,Indie,Relaxing,Nature,Beau...",2 GB RAM,2019.0,Jan,84.0,332.0,29.0,0.0,0


extract `windows`, `mac`, and `linux`

In [14]:
df_platform = df_pegi.copy()
df_platform['windows'] = np.where(df_platform['platforms'].str.contains('Windows',na=False), 1, 0)
df_platform['mac'] = np.where(df_platform['platforms'].str.contains('Mac OS',na=False), 1, 0)
df_platform['linux'] = np.where(df_platform['platforms'].str.contains('Linux',na=False), 1, 0)
df_platform['VR'] = np.where(df_platform['platforms'].str.contains('VR Supported',na=False), 1, 0)
df_platform = df_platform.drop('platforms',axis=1)
df_platform.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,ram,year,month,user_rating,total_review,supported_language,price,pegi_rated,windows,mac,linux,VR
53341,Rocksmith® 2014 – Jimi Hendrix - “Foxey Lady”,Ubisoft - San Francisco,,1 user reviews,"Casual,Simulation",2 GB RAM,2014.0,Dec,,,6.0,35000.0,1,1,1,0,0
15502,Complete the set,,,,,,,,,,,247198.0,0,1,0,0,0
50246,Riddles of the Owls Kingdom - Soundtrack,DigiMight,,,"Indie,Casual,Free to Play",512 MB RAM,2018.0,Oct,,,4.0,24999.0,0,1,1,0,0
38180,POG 7,Cute Hannah's Games,Cute Hannah's Games,1 user reviews,"Casual,Side Scroller,2D,Cute,Family Friendly,I...",2 GB RAM,2022.0,Feb,,,29.0,17499.0,0,1,0,0,0
63765,Pixel Shopkeeper,"Pixel Prototype, LLC","Pixel Prototype, LLC",Mostly Positive,"Simulation,Indie,Pixel Graphics,Strategy,Adven...",1 GB RAM,2017.0,Jul,78.0,140.0,1.0,52999.0,0,1,1,1,0
48924,Zephyr,New World Computing,Ziggurat,1 user reviews,"Combat Racing,FPS,Pixel Graphics,Futuristic,Dy...",256 MB RAM,2021.0,Feb,,,1.0,52999.0,0,1,1,0,0
16168,Dashy Square,"Kastriot Sulejmani,Logan Gerrol",KasSanity,Very Positive,"Indie,Action,Music,Difficult,Great Soundtrack,...",512 MB RAM,2016.0,Apr,84.0,109.0,9.0,24999.0,1,1,1,0,0
51730,The Banner Saga 3 - Eternal Arena,Stoic,Versus Evil,5 user reviews,"Strategy,RPG,Indie",2 GB RAM,2018.0,Dec,,,10.0,33939.0,0,1,1,0,0
57828,Infinite World: Randomize everything,Vik,VicX Studio,Mixed,"Adventure,Indie,RPG,Action,Survival,Open World...",4 GB RAM,2018.0,Jun,42.0,54.0,1.0,8499.0,0,1,0,0,0
49221,Dimension Shifter,"Eyexapp Play,Eyexapp Creative",Eyexapp,1 user reviews,"Action-Adventure,Adventure,2D Platformer,Pixel...",1 GB RAM,2022.0,Jan,,,1.0,39999.0,0,1,0,0,0


#### remap `ram`

In [15]:
df_platform['ram'][0]

' 2 GB RAM'

In [16]:
df_ram = df_platform.copy()
df_ram.loc[df_ram['ram'].str.lower().str.contains('16.0 gb', na=False),'ram'] = '16 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('12.00 gb', na=False),'ram'] = '12 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('10 gb', na=False),'ram'] = '10 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('9 gb|9 gb', na=False),'ram'] = '9 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('8 gb|8000 mb|８ gb|8gb|8.0 gb|8+', na=False),'ram'] = '8 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('7 gb', na=False),'ram'] = '7 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('6 gb|6000 mb|６ gb', na=False),'ram'] = '6 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('5 gb|5000 mb', na=False),'ram'] = '5 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('4 gb|4096 mb|4000 mb|4gb|４ gb|4.00 gb|4g|4 ram gb', na=False),'ram'] = '4 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('3 gb|3000 mb|3 go mb|3gb', na=False),'ram'] = '3 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('2 gb|2048|2000 mb|2gb|1536 mb|2.0 gb|2000 gb|２ gb|1.5gb|2.0gb|2g', na=False),'ram'] = '2 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('1 gb|1024|1000 mb|1gb|800 mb|750 mb|1280 mb|700 mb|900 mb|１ gb|1+', na=False),'ram'] = '1 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('512|500 mb|400 mb|512mb|600 mb|597 mb|320 mb|420 mb|500mb|349 mb', na=False),'ram'] = '512 MB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('256 gb|200 mb|300 mb|250 mb|256mb|220 mb|256 ram', na=False),'ram'] = '256 MB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('128 mb|64m|70 mb|100 mb|150 mb|16 mb|8 mb|2 mb|1 mb|4 mb|50 mb|3 mb|6 mb|10 mb|128mb|80 mb|4 ram mb|30 mb|40 mb|25 mb|5 mb|120 mb|64 mb|90 mb|60 mb|95 mb|65 mb|64mb|16mb|100以上|32mb|97 mb|20 mb|99 mb|4mb|69 mb|59 mb', na=False),'ram'] = '<128 MB'
df_ram.loc[~df_ram['ram'].str.lower().str.contains('16 gb|12 gb|10 gb|9 gb|8 gb|7 gb|6 gb|5 gb|4 gb|3 gb|2 gb|1 gb|512 mb|256 mb|<128 mb', na=False),'ram'] = 'Unknown'
df_ram['ram'].value_counts()

4 GB       13515
1 GB       13299
2 GB       12720
Unknown    11840
8 GB        7941
<128 MB     2853
6 GB        1946
3 GB         658
5 GB         109
7 GB           4
9 GB           2
Name: ram, dtype: int64

In [17]:
df_ram.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,ram,year,month,user_rating,total_review,supported_language,price,pegi_rated,windows,mac,linux,VR
8460,IRON GUARD VR,Xlab Digital,Xlab Digital,Very Positive,"VR,Tower Defense,Strategy,RTS,Sci-fi,Action,Ro...",4 GB,2021.0,Sep,96.0,50.0,5.0,95999.0,0,1,0,0,0
48557,Pac vs Ghosts,Pix Arts,Pix Arts,2 user reviews,"Action,Casual,Arcade,Classic,Cartoon,3D Platfo...",Unknown,2020.0,Jul,,,1.0,24999.0,0,1,1,0,0
38613,Fate/EXTELLA - Cheerleader,Marvelous Inc.,"XSEED Games,Marvelous USA, Inc.",1 user reviews,Action,4 GB,2017.0,Jul,,,4.0,18999.0,0,1,0,0,0
40347,Fantasy Memory Card Game,Boogygames Studios,Boogygames Studios,1 user reviews,"Card Game,Board Game,Puzzle,Point & Click,Matc...",1 GB,2020.0,Mar,,,1.0,8499.0,0,1,0,0,0
42468,BLUE REFLECTION - Arland Maid Costumes (Lime),"KOEI TECMO GAMES CO., LTD.","KOEI TECMO GAMES CO., LTD.",,RPG,4 GB,2017.0,Sep,,,3.0,55999.0,1,1,0,0,0
9954,House Sketcher 3D,Sebastian Kemper,Sebastian Kemper,Mixed,"Simulation,Design & Illustration,Building,Soft...",4 GB,2021.0,Dec,46.0,13.0,27.0,69999.0,0,1,1,0,0
51691,Pac-Man Championship Edition DX+: Big Eater Co...,"Mine Loader Software Co., Ltd.",BANDAI NAMCO Entertainment,9 user reviews,Action,1 GB,2013.0,Sep,,,5.0,20000.0,1,1,0,0,0
36035,Guts and Glory - Original Soundtrack,HakJak,tinyBuild,5 user reviews,"Action,Indie,Casual,Racing,Gore,Violent,Soundt...",Unknown,2018.0,Jul,,,,39999.0,0,0,0,0,0
5426,A Street Cat's Tale,feemodev,"CFK Co., Ltd.",Very Positive,"Indie,Simulation,Adventure,Cats,Pixel Graphics...",2 GB,2019.0,Sep,96.0,162.0,4.0,39999.0,0,1,1,0,0
7981,Raptor: Call of the Shadows (1994 Classic Edit...,Cygnus Studios,Apogee Entertainment,Very Positive,"Action,Shoot 'Em Up,Classic,2D,Old School,1990's",2 GB,2014.0,May,93.0,101.0,1.0,45999.0,0,1,1,0,0


#### remap `overall_reviews`

In [18]:
df_or = df_ram.copy()
df_or.loc[~df_or['overall_reviews'].str.contains('Positive|Negative|Very|Overwhelmingly|Mixed|Mostly', na=False),'overall_reviews'] = '<10 reviews'
df_or['overall_reviews'].value_counts()

<10 reviews                26699
Very Positive              10628
Mixed                       9458
Positive                    9079
Mostly Positive             6749
Mostly Negative             1559
Overwhelmingly Positive      417
Negative                     234
Very Negative                 57
Overwhelmingly Negative        7
Name: overall_reviews, dtype: int64

In [19]:
df_or.sample(5)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,ram,year,month,user_rating,total_review,supported_language,price,pegi_rated,windows,mac,linux,VR
13820,Beach Buggy Racing 2: Hot Wheels™ Edition,,,<10 reviews,,Unknown,,,,,,161098.0,0,1,0,0,0
37290,LET IT DIE -(7 Mil Downloads)100 Death Metals-,"GRASSHOPPER MANUFACTURE INC.,SUPERTRICK GAMES...",GungHo Online Entertainment America,<10 reviews,"Action,Free to Play,Gore,Violent,Nudity,Sexual...",8 GB,2021.0,Sep,,,10.0,119999.0,1,1,0,0,0
12665,Ragnar's Chinese Memory Game,Ragnar Brynjúlfsson,Ragnar Brynjúlfsson,Positive,"Education,Card Game,Board Game,Hand-drawn,Fore...",4 GB,2020.0,Aug,100.0,10.0,3.0,48999.0,0,1,0,1,0
42842,"灵魂筹码 - 幽伶刀马花旦套装 Soul at Stake - ""Tao Ma Tan"" T...",Chongming Studio,Chongming Studio,<10 reviews,"Action,RPG,Indie,Gore,Violent,Multiplayer,Surv...",8 GB,2020.0,Jun,,,2.0,169999.0,0,1,0,0,0
19005,Gabe Newell Simulator,Antonio Renna,Antonio Renna,Mixed,"Memes,Action,Simulation,Indie,Adventure,Parody...",2 GB,2015.0,Dec,49.0,461.0,1.0,19999.0,0,1,1,1,0


#### reformat and remap `tags`

In [20]:
df_tags = df_or.copy()
df_tags.loc[df_tags['tags'].isna()==True,'tags'] = 'No Tags'
df_tags['tags'] = df_tags['tags'].str.split(',')
#df_tags['tags'] = df_tags['tags'].replace(',', '","')
#df_tags['tags'] = '["' + df_tags['tags'] + '"]'
df_tags.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,ram,year,month,user_rating,total_review,supported_language,price,pegi_rated,windows,mac,linux,VR
10768,Injustice™ 2 - Black Manta,"NetherRealm Studios,QLOC",WB Games,Positive,[Action],4 GB,2017.0,Nov,93.0,15.0,8.0,48999.0,1,1,0,0,0
28716,Catsby,OOTOTGames,OOTOTGames,Mixed,"[Adventure, Indie, Retro, Platformer]",1 GB,2017.0,Apr,68.0,22.0,1.0,45999.0,0,1,1,0,0
28069,Archamon,Vionsoft,Vionsoft,Mixed,"[Strategy, Indie, City Builder]",4 GB,2017.0,Nov,66.0,24.0,2.0,82999.0,0,1,0,0,0
21558,Cloudpunk + Soundtrack Bundle,,,<10 reviews,[No Tags],Unknown,,,,,,161098.0,0,1,0,0,0
30148,Data Bundle,,,<10 reviews,[No Tags],Unknown,,,,,,174990.0,0,1,0,0,0
48777,Xtreme Typing,Individual Software,Individual Software,<10 reviews,"[Casual, Education, Software Training, Relaxin...",Unknown,2020.0,Aug,,,1.0,32999.0,0,1,0,0,0
33266,"MONSTER HUNTER RISE - ""Canyne Tail"" Hunter lay...","CAPCOM Co., Ltd.","CAPCOM Co., Ltd.",<10 reviews,"[Action, RPG, Co-op, Action RPG, JRPG]",8 GB,2022.0,Jan,,,13.0,21999.0,1,1,0,0,0
20614,Animal and Aquarist,,,<10 reviews,[No Tags],Unknown,,,,,,161098.0,0,1,0,0,0
59525,Ho-Ho-Home Invasion,Whitepot Studios,Whitepot Studios,Very Positive,"[Free to Play, Stealth, Indie, Action, Short, ...",4 GB,2020.0,Dec,95.0,85.0,6.0,0.0,0,1,0,0,0
31118,Gangsta bundle,,,<10 reviews,[No Tags],Unknown,,,,,,185994.0,0,1,0,0,0


In [21]:
def to_1D(series): #
 return pd.Series([x for _list in series for x in _list])

#### reformat and remap `developer`

In [22]:
df_dev = df_tags.copy()
df_dev.loc[df_dev['developer'].isna()==True,'developer'] = 'Unknown'
df_dev['developer'] = df_dev['developer'].str.split(',')
#df_tags['tags'] = df_tags['tags'].replace(',', '","')
#df_tags['tags'] = '["' + df_tags['tags'] + '"]'
df_dev.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,ram,year,month,user_rating,total_review,supported_language,price,pegi_rated,windows,mac,linux,VR
47531,Fateholder Quest,[Brewsterland Studios],Brewsterland Studios,<10 reviews,"[RPG, Party-Based RPG, JRPG, 3D, Comedy, Fanta...",2 GB,2020.0,Aug,,,1.0,17499.0,0,1,0,0,0
43440,2033: Das Erschwachen der Macht,[LOT],Zweitausendeins GmbH & Co. KG,<10 reviews,"[Adventure, Interactive Fiction, Visual Novel,...",4 GB,2021.0,Sep,,,2.0,82999.0,0,1,0,0,0
33324,RPG Maker MZ - FES Resource Pack,"[Gotcha Gotcha Games, KADOKAWA]",Degica,<10 reviews,"[RPG, Web Publishing, Design & Illustration]",Unknown,2020.0,Sep,,,2.0,139999.0,0,1,1,0,0
65122,Devil May Cry 5 - Sweet Surrender,"[CAPCOM Co., Ltd.]","CAPCOM Co., Ltd.",Positive,"[Action, Nudity, Violent]",8 GB,2019.0,Mar,92.0,27.0,12.0,29999.0,1,1,0,0,0
42173,Bounty Hunters,[WhiteSquare Game Studio],WhiteSquare Game Studio,<10 reviews,"[Action, Shooter, Arena Shooter, Third-Person ...",8 GB,2022.0,Jan,,,1.0,39999.0,0,1,1,0,0
51352,New Supper Banana!,[Pikku-a],Pikku-a,<10 reviews,"[Adventure, Platformer, Funny, Indie, Hand-dra...",1 GB,2021.0,Dec,,,3.0,32999.0,0,1,0,0,0
49212,Koi Solitaire,[Puzzle Lab],Dikobraz Games,<10 reviews,"[Casual, Indie]",1 GB,2018.0,Sep,,,1.0,59999.0,0,1,0,0,0
50411,RPG Maker MV - Zonderland,[Intelligentsia],Degica,<10 reviews,"[Design & Illustration, Web Publishing]",<128 MB,2018.0,Jul,,,12.0,59999.0,0,1,1,1,0
15114,Painters Guild,[Lucas Molina],Lucas Molina,Mixed,"[Simulation, Indie, Casual, Strategy, Manageme...",1 GB,2015.0,Sep,64.0,310.0,1.0,89999.0,0,1,0,0,0
17011,Robert: Space Stories and Battles,[Dnovel],Dnovel,Positive,"[Casual, Adventure, RPG, Arcade, Visual Novel,...",1 GB,2022.0,Mar,100.0,14.0,1.0,17499.0,0,1,0,0,0


In [23]:
to_1D(df_dev['developer']).value_counts().head(100)

Unknown                    5214
Ubisoft - San Francisco    1166
 LTD.                      1137
 LLC                       1108
KOEI TECMO GAMES CO.        974
                           ... 
TK.Projects                  50
Techland                     50
Orange_Juice                 49
AMAX Interactive             49
Harmonix Music Systems       49
Length: 100, dtype: int64

#### reformat and remap `publisher`

In [24]:
df_pub = df_dev.copy()
df_pub.loc[df_pub['publisher'].isna()==True,'publisher'] = 'Unknown'
df_pub['publisher'] = df_pub['publisher'].str.split(',')
df_pub.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,ram,year,month,user_rating,total_review,supported_language,price,pegi_rated,windows,mac,linux,VR
68208,Ironsight,"[WipleGames Inc., ]","[WipleGames Inc., ]",Mixed,"[Free to Play, Action, FPS, Multiplayer, Shoot...",6 GB,2019.0,Jun,68.0,566.0,14.0,0.0,0,1,0,0,0
37394,RTK13WPK - Watercolor Painting Style Officer C...,"[KOEI TECMO GAMES CO., LTD.]","[KOEI TECMO GAMES CO., LTD.]",<10 reviews,[Simulation],1 GB,2017.0,Jun,,,5.0,35999.0,1,1,0,0,0
47994,Rocksmith® 2014 Edition – Remastered – Raspber...,[Ubisoft - San Francisco],[Unknown],<10 reviews,"[Casual, Simulation]",2 GB,2017.0,May,,,6.0,45000.0,1,1,1,0,0
22141,Gun Witch,[AGM Studios],[AGM Studios],Very Positive,"[Adventure, RPG, Platformer, Action RPG, Shoot...",1 GB,2020.0,Dec,91.0,213.0,1.0,0.0,0,1,0,0,0
58579,Grotesque Tactics: Evil Heroes,[Silent Dreams],[Headup],Mixed,"[RPG, Strategy, Indie, Turn-Based, Parody , Ta...",1 GB,2010.0,Oct,60.0,285.0,2.0,89999.0,0,1,0,0,0
34056,Fimbul Winter,[Success Games],[Success Games],Mostly Negative,"[Action, Adventure, Simulation, Strategy, Sing...",2 GB,2021.0,May,20.0,10.0,5.0,8499.0,0,1,0,0,0
32221,DUMB Infernal,[Beem Media],[Beem Media],Mixed,"[Action, Indie, FPS, Arcade, Pixel Graphics]",Unknown,2020.0,Jan,65.0,23.0,1.0,0.0,0,1,0,0,0
3692,Higurashi When They Cry Hou - Ch.6 Tsumihoroboshi,[07th Expansion],[MangaGamer],Very Positive,"[Visual Novel, Adventure, Horror, Anime, Myste...",1 GB,2018.0,Jun,98.0,334.0,2.0,59999.0,0,1,1,1,0
18224,Race Race Racer,[Enaayah Software Development and Services Pri...,[Enaayah Software Development and Services Pri...,Positive,"[Racing, Casual, Relaxing, Comedy, Great Sound...",2 GB,2019.0,Dec,84.0,26.0,1.0,8499.0,0,1,1,1,0
395,ARK: Extinction - Expansion Pack,"[Studio Wildcard, Instinct Games, Efecto Studi...","[Snail Games USA, Studio Wildcard]",Very Positive,"[Adventure, Action, Massively Multiplayer, Sur...",8 GB,2018.0,Nov,86.0,15.0,21.0,,1,1,1,1,0


In [25]:
to_1D(df_pub['publisher']).value_counts().head(60)

Unknown                       10180
Degica                         1078
 LTD.                           906
 Inc.                           862
KOEI TECMO GAMES CO.            788
 Ltd.                           678
Dovetail Games - Trains         514
CAPCOM Co.                      466
Paradox Interactive             425
Big Fish Games                  380
BANDAI NAMCO Entertainment      380
SEGA                            379
Square Enix                     375
Electronic Arts                 325
D3 PUBLISHER                    324
Ubisoft                         319
 LLC                            308
XSEED Games                     289
Marvelous USA                   283
THQ Nordic                      281
Feral Interactive (Mac)         269
2K                              228
Dovetail Games - Flight         228
Slitherine Ltd.                 224
Idea Factory International      212
Feral Interactive (Linux)       204
8floor                          197
NIS America                 

## data understanding

In [26]:
df_final = df_pub.copy()
df_final.sample(5)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,ram,year,month,user_rating,total_review,supported_language,price,pegi_rated,windows,mac,linux,VR
55850,Australian Football Coach,[Statto Software],[Statto Software],Mostly Negative,"[Strategy, Indie, Simulation, Sports, Football...",1 GB,2017.0,Sep,30.0,10.0,1.0,108999.0,0,1,0,0,0
3785,Geometry Arena,[011 Games],[011 Games],Very Positive,"[Roguelite, Bullet Hell, Minimalist, Replay Va...",8 GB,2021.0,Jun,96.0,26.0,2.0,48999.0,0,1,1,0,0
1259,Lethal League Blaze,[Team Reptile],[Team Reptile],Very Positive,"[Action, Indie, Great Soundtrack, 2D Fighter, ...",4 GB,2018.0,Oct,100.0,32.0,10.0,108999.0,0,1,1,1,0
6157,Mutropolis,[Pirita Studio],[Application Systems Heidelberg],Very Positive,"[Indie, Detective, Adventure, Point & Click, M...",4 GB,2021.0,Feb,91.0,86.0,10.0,108999.0,0,1,1,1,0
19099,100 nya,[.M.Y.W.],[.M.Y.W.],Positive,"[Indie, Strategy, Puzzle]",1 GB,2016.0,Nov,91.0,23.0,19.0,69999.0,0,1,0,0,0


In [28]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64887 entries, 0 to 68467
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               64887 non-null  object 
 1   developer           64887 non-null  object 
 2   publisher           64887 non-null  object 
 3   overall_reviews     64887 non-null  object 
 4   tags                64887 non-null  object 
 5   ram                 64887 non-null  object 
 6   year                59907 non-null  float64
 7   month               59903 non-null  object 
 8   user_rating         38188 non-null  float64
 9   total_review        38188 non-null  float64
 10  supported_language  57750 non-null  float64
 11  price               64366 non-null  float64
 12  pegi_rated          64887 non-null  int32  
 13  windows             64887 non-null  int32  
 14  mac                 64887 non-null  int32  
 15  linux               64887 non-null  int32  
 16  VR  

In [29]:
df_final.describe()

Unnamed: 0,year,user_rating,total_review,supported_language,price,pegi_rated,windows,mac,linux,VR
count,59907.0,38188.0,38188.0,57750.0,64366.0,64887.0,64887.0,64887.0,64887.0,64887.0
mean,2018.223396,76.416963,218.957971,4.77413,83633.83,0.201181,0.965232,0.283632,0.167707,0.009493
std,2.77341,18.492328,4440.137845,5.349063,158817.4,0.400886,0.183193,0.450764,0.373609,0.096971
min,1997.0,0.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2017.0,66.0,18.0,1.0,24999.0,0.0,1.0,0.0,0.0,0.0
50%,2019.0,80.0,40.0,2.0,52999.0,0.0,1.0,0.0,0.0,0.0
75%,2020.0,91.0,122.0,7.0,95999.0,0.0,1.0,1.0,0.0,0.0
max,2077.0,100.0,558563.0,29.0,8990155.0,1.0,1.0,1.0,1.0,1.0


In [35]:
df_final[['overall_reviews','ram']].describe()

Unnamed: 0,overall_reviews,ram
count,64887,64887
unique,10,11
top,<10 reviews,4 GB
freq,26699,13515


## export csv

In [36]:
df_final.to_csv('for_EDA.csv', index=False)