## import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

## load dataset

In [2]:
raw = pd.read_csv('steam_games.csv')
raw.sample(5)

Unnamed: 0,title,url,image,release_date,platforms,discount_rate,original_price,discounted_price,developer,publisher,...,whole_reviews,description,tags,genre,processor,ram,graphic_card,rating,language,metacriticts
39147,The Legend of Heroes: Trails of Cold Steel III...,https://store.steampowered.com/app/1262419/The...,https://cdn.akamai.steamstatic.com/steam/apps/...,"23 Mar, 2020",Windows,,Rp 17 499,,"Nihon Falcom,Engine Software BV,PH3 GmbH","NIS America, Inc.",...,,,RPG,RPG,,,,,"English,French,Japanese",
70141,Revived Souls,https://store.steampowered.com/app/1653460/Rev...,https://cdn.akamai.steamstatic.com/steam/apps/...,Feb 2024,"Windows,Mac OS",,,,Destinate Games,Destinate Games,...,,Explore a Celtic world full of magic devices a...,"Adventure,Puzzle,Story Rich,Philosophical,Medi...","Adventure,Indie,RPG",2 Ghz,2 GB RAM,"256 mb video memory, shader model 3.0+",,English,
109678,NASCAR '15 2016 Season Update,https://store.steampowered.com/app/515300/NASC...,https://cdn.akamai.steamstatic.com/steam/apps/...,"29 Aug, 2016",Windows,,Free,,Eutechnyx,Dusenberry Martin Racing,...,,,Racing,Racing,AMD Athlon 64 X2 6000 (2*3000 MHz),2048 MB RAM,GeForce 8800 GT,,English,
103913,Pirate Island Mini Golf VR,https://store.steampowered.com/app/1256870/Pir...,https://cdn.akamai.steamstatic.com/steam/apps/...,"8 Jun, 2020",Windows,,Rp 59 999,,Serious Tangents,Serious Tangents,...,,Pirate Island Mini Golf VR is a single or mult...,"Early Access,Casual,Sports,VR,Mini Golf,Atmosp...","Casual,Sports,Early Access","Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz, 3601...",16 GB RAM,NVIDIA GeForce GTX 1070,,English,
31170,In Search of the Most Dangerous Town on the In...,https://store.steampowered.com/app/380890/In_S...,https://cdn.akamai.steamstatic.com/steam/apps/...,"18 Jun, 2015",,,Free,,,,...,,"Râmnicu Vâlcea, Romania has only 120,000 resid...","Hacking,Documentary",,Intel Core 2 or AMD equivalent,1 GB RAM,,,English,


## data understanding

In [3]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121904 entries, 0 to 121903
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   title             121893 non-null  object 
 1   url               121904 non-null  object 
 2   image             121904 non-null  object 
 3   release_date      113850 non-null  object 
 4   platforms         117576 non-null  object 
 5   discount_rate     2941 non-null    object 
 6   original_price    109675 non-null  object 
 7   discounted_price  2935 non-null    object 
 8   developer         113921 non-null  object 
 9   publisher         103890 non-null  object 
 10  overall_reviews   78125 non-null   object 
 11  recent_reviews    78125 non-null   object 
 12  whole_reviews     5773 non-null    object 
 13  description       75925 non-null   object 
 14  tags              112020 non-null  object 
 15  genre             111334 non-null  object 
 16  processor         10

Observations:
- `url`,`image`,`discount_rate`,`discounted_price`,`description` drop
- `release_date` extract month and year
- `platfoms`,`developer`,`publisher`,`tags` split
- `original_price` change data type (extract price)
- `text_reviews` extract rating (`user_rating`) and total user reviews (`total_reviews`)
- `language` extract number of language (`language`)
- `metacriticts` too much missing values, drop

In [4]:
raw.describe()

Unnamed: 0,metacriticts
count,2223.0
mean,80.931174
std,4.404322
min,75.0
25%,77.0
50%,80.0
75%,84.0
max,97.0


In [5]:
raw.describe(include='object')

Unnamed: 0,title,url,image,release_date,platforms,discount_rate,original_price,discounted_price,developer,publisher,...,recent_reviews,whole_reviews,description,tags,genre,processor,ram,graphic_card,rating,language
count,121893,121904,121904,113850,117576,2941,109675,2935,113921,103890,...,78125,5773,75925,112020,111334,100264,99696,93894,18891,110255
unique,103975,121904,104388,5956,13,84,5094,1023,40378,34416,...,13798,4995,60557,60019,1846,17369,1006,20152,9,9687
top,Game + Soundtrack,https://store.steampowered.com/app/1172470/Ape...,https://cdn.akamai.steamstatic.com/steam/apps/...,2022,Windows,-50%,Rp 8 499,Rp 4 249,"SmiteWorks USA, LLC",TigerQiuQiu,...,- Need more user reviews to generate a score,- 97% of the 182 user reviews for this game ar...,Find the objects that are hidden on the map.,Action,"Action,Indie",1.6 GHz or higher processor,4 GB RAM,Graphics card recommended,https://store.akamai.steamstatic.com/public/sh...,English
freq,22,1,6,1194,83499,545,9675,200,2375,2132,...,32928,5,20,2414,5825,2302,26055,2303,5836,52491


observations:
- `rating` data is wrong, consider drop or rescrape data
- `tags` too much unique value, consider to create new columns tag that has been aggregated

## data preprocessing (format)

### drop duplicates and uninformative columns

In [6]:
df_drop = raw.copy()
df_drop.drop_duplicates(subset='title',inplace=True)
df_drop['title'].duplicated().sum()

0

In [7]:
drop = ['url','image','discount_rate','discounted_price','description','metacriticts','processor','graphic_card'] #these variables unable to show any kind of valuable information that can be used for current analysis and machine learning model, consider to be used for other analysis
df_drop = df_drop.drop(labels=drop,axis=1)
df_drop.sample(10)

Unnamed: 0,title,release_date,platforms,original_price,developer,publisher,overall_reviews,recent_reviews,whole_reviews,tags,genre,ram,rating,language
63916,Animated Puzzles - Spooky Pack,"29 Oct, 2020","Windows,Mac OS,Linux",Rp 24 999,Mexond,Mexond,,,,"Indie,Casual","Casual,Indie",2 GB RAM,,"English,French,Italian,German,Spanish - Spain,..."
50814,Soviet Souls,"7 May, 2019","Windows,Mac OS,Linux",Rp 17 499,"Fang's Lab,DP Games,Flatcoon","Fang's Lab,DP Games,Flatcoon",6 user reviews,- Need more user reviews to generate a score,,"Indie,Action,Adventure,Platformer,2D,Pixel Gra...","Action,Adventure,Indie",1 GB RAM,,"English,Russian,Ukrainian"
24524,Eternal Fantasy,"18 Oct, 2018",Windows,Rp 69 999,"Circus,YAMAYURI GAMES",SakuraGame,Mixed,- 68% of the 29 user reviews for this game are...,,"Sexual Content,Nudity,Adventure,Indie,RPG,Visu...","Adventure,Indie,RPG",1 GB RAM,,"English,Japanese,Traditional Chinese"
15980,Range is HOT!,"9 Mar, 2021",Windows,Rp 139 999,Winter Bear Studio,Winter Bear Studio,Mostly Positive,- 77% of the 27 user reviews for this game are...,,"Simulation,Sports,VR,Competitive,Action,Shoote...","Action,Simulation,Sports,Early Access",8 GB RAM,,English
33174,Data Bundle for gifts,,Windows,Rp 174 990,,,,,,,,,,
748,Cities: Skylines - Campus,"21 May, 2019","Windows,Mac OS,Linux",Rp 109 199,Colossal Order Ltd.,Paradox Interactive,Mostly Positive,- 76% of the 214 user reviews for this game ar...,,"Simulation,Strategy,City Builder","Simulation,Strategy",8 GB RAM,https://store.akamai.steamstatic.com/public/sh...,"English,French,German,Spanish - Spain,Polish,P..."
37618,Rover Mechanic Simulator - Perseverance Rover DLC,"18 Feb, 2021",Windows,Rp 48 999,▲ Pyramid Games,Pyramid Games S.A.,5 user reviews,- Need more user reviews to generate a score,,"Indie,Simulation,Singleplayer,Replay Value,Fam...","Indie,Simulation",8 GB RAM,,"English,French,German,Simplified Chinese,Polis..."
44800,Zaccaria Pinball - House of Diamonds Table,"16 Jun, 2016","Windows,Mac OS,Linux",Rp 18 999,Magic Pixel Kft.,Magic Pixel Kft.,1 user reviews,- Need more user reviews to generate a score,,"Free to Play,Casual,Simulation,Sports","Casual,Free to Play,Simulation,Sports",2 GB RAM,,English
36680,Fantasy Grounds - 5E: Book of Lost Spells (Fif...,"18 Apr, 2016",Windows,Rp 135 999,"SmiteWorks USA, LLC",,2 user reviews,- Need more user reviews to generate a score,,"RPG,Indie,Strategy,Fantasy,Medieval,Party-Base...","Indie,RPG,Strategy",1 GB RAM,,English
49243,Fantasy Grounds - Starfinder Flip-Tiles - City...,"29 Mar, 2022","Windows,Mac OS",Rp 108 999,"SmiteWorks USA, LLC",,,,,"RPG,Indie,Strategy,Space,Sci-fi,Tabletop","Indie,RPG,Strategy",1 GB RAM,,English


### data extraction and formatting

#### extract `month` and `year`

In [8]:
#define function to extract year
def extract_year(x):
    year = None
    if type(x) == list:
        for i in x:
            if len(i) == 4:
                year = i
                return year
            else: 
                year = None
        return year
    else:
        year = x
    return year

In [9]:
#df_drop[['date','month','year']] = df_drop['release_date'].str.split(' ', n=2 , expand=True)
df_extract = df_drop.copy()
month = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
#df_extract['year'] = df_extract['release_date'].str[-4:].apply(pd.to_numeric, errors=('coerce')) 
df_extract['year'] = df_extract['release_date'].str.findall('(\d+)').apply(lambda x: extract_year(x)).astype(float)
df_extract['month'] = df_extract['release_date'].str.findall("[a-zA-Z]+").str[0]
df_extract['month'] = df_extract['month'].replace(['August','June'],['Aug','Jun'])
df_extract.loc[~df_extract['month'].isin(month),'month'] = None
df_extract = df_extract.drop('release_date', axis=1) #drop release date
df_extract.sample(5)

Unnamed: 0,title,platforms,original_price,developer,publisher,overall_reviews,recent_reviews,whole_reviews,tags,genre,ram,rating,language,year,month
104874,RPG Maker MZ - OverWorld Battlebacks,"Windows,Mac OS",Rp 95 999,Joel Steudler,Degica,,,,"RPG,Web Publishing,Design & Illustration","RPG,Design & Illustration,Web Publishing",,,"English,Japanese,French,Italian,German,Spanish...",2020.0,Oct
82260,The Great Race,Windows,Rp 69 999,Triumph LLC,Triumph LLC,7 user reviews,- Need more user reviews to generate a score,,"Racing,Action,Indie,Simulation,Adventure,Satir...","Action,Adventure,Indie,Racing,Simulation",4 GB RAM,https://store.akamai.steamstatic.com/public/sh...,"English,Russian",2018.0,May
87680,Knight And Mourning,"Windows,Mac OS",,1actose,1actose,,,,"Precision Platformer,Difficult,Platformer,2D P...","Action,Adventure,Casual,Indie",2 GB RAM,,English,2022.0,
96199,Cosmos Conquer Demo,Windows,Free Demo,Paweł Wiecha,Iguana Mercenary,,,,"Early Access,Casual,Strategy,Simulation,God Ga...","Casual,Indie,Simulation,Strategy,Early Access",2 GB RAM,,"English,Polish,Russian",2022.0,Jan
103844,"DFF NT: Laevateinn, Shantotto's 4th Weapon",Windows,Rp 12 000,"Square Enix,KOEI TECMO GAMES CO., LTD.",Square Enix,,,,"Action,Free to Play","Action,Free to Play",8 GB RAM,https://store.akamai.steamstatic.com/public/sh...,"English,French,Italian,German,Spanish - Spain,...",2019.0,Mar


#### extract `user_rating` and `total_reviews`

In [12]:
df_extract_2 = df_extract.copy()
df_extract_2['whole_reviews'] = df_extract_2['whole_reviews'].str.replace(',','')
df_extract_2['user_rating_all'] = df_extract_2['whole_reviews'].str.findall('(\d+)').str[0].astype(float)
df_extract_2['total_review_all'] = df_extract_2['whole_reviews'].str.findall('(\d+)').str[1].astype(float)
df_extract_2 = df_extract_2.drop('whole_reviews',axis=1)
df_extract_2.sample(5)

Unnamed: 0,title,platforms,original_price,developer,publisher,overall_reviews,recent_reviews,tags,genre,ram,rating,language,year,month,user_rating_all,total_review_all
36904,The Last One,Windows,Rp 64 999,Phoenix Interactive Studio,Phoenix Interactive Studio,Mostly Negative,- 32% of the 115 user reviews for this game ar...,"Survival,Multiplayer,Zombies,Action,Adventure,...","Action,Adventure,Indie,Early Access",2 GB RAM,,"English,Turkish,French,German,Russian,Spanish ...",2018.0,Jan,,
55274,DOA6 Raidou Deluxe Costume,Windows,Rp 19 000,"KOEI TECMO GAMES CO., LTD.","KOEI TECMO GAMES CO., LTD.",1 user reviews,- Need more user reviews to generate a score,"Action,Gore,Violent,Nudity,Sexual Content",Action,8 GB RAM,https://store.akamai.steamstatic.com/public/sh...,"English,French,Italian,German,Spanish - Spain,...",2019.0,Aug,,
97034,Tiger Tank 59 Ⅰ Rainstorm MP060,Windows,Rp 8 499,TigerQiuQiu,TigerQiuQiu,,,"Action,Indie,Casual,Old School,Pixel Graphics,...","Action,Casual,Indie",4 GB RAM,,English,2021.0,Jul,,
91810,Without A Roof (W.A.R.),Windows,Rp 119 999,New World Coders,New World Coders,2 user reviews,- Need more user reviews to generate a score,"Action,Indie,Simulation,Adventure,RPG,Massivel...","Action,Adventure,Indie,Massively Multiplayer,R...",8 GB RAM,,English,2020.0,Jan,,
47746,Struggling Ball,Windows,Rp 39 999,Struggling Ball,Struggling Ball,3 user reviews,- Need more user reviews to generate a score,"Casual,Adventure,Action-Adventure,Runner,Parko...","Adventure,Casual,Indie",2 GB RAM,,"English,Simplified Chinese",2021.0,Jul,,


In [13]:
df_extract_2['recent_reviews'] = df_extract_2['recent_reviews'].str.replace(',','')
df_extract_2['user_rating_recent'] = df_extract_2['recent_reviews'].str.findall('(\d+)').str[0].astype(float)
df_extract_2['total_review_recent'] = df_extract_2['recent_reviews'].str.findall('(\d+)').str[1].astype(float)
df_extract_2 = df_extract_2.drop('recent_reviews',axis=1)
df_extract_2.sample(5)

Unnamed: 0,title,platforms,original_price,developer,publisher,overall_reviews,tags,genre,ram,rating,language,year,month,user_rating_all,total_review_all,user_rating_recent,total_review_recent
41066,Hunt-or-Haunt,Windows,Rp 39 999,DAONE GAMES,DAONE GAMES,4 user reviews,"Early Access,Multiplayer,Party Game,Indie,Casu...","Action,Casual,Indie,Strategy,Early Access",4 GB RAM,,"English,Simplified Chinese,Traditional Chinese",2022.0,Jan,,,,
30348,Soldier and Sapper,Windows,Rp 137 698,,,,,,,,,,,,,,
77491,Afterlife Empire Demo,"Windows,Mac OS",Free Demo,"Autobotika,Game Nation Inc",The Fine Young Capitalists,Mixed,"Casual,Indie,Strategy","Casual,Indie,Strategy",2 GB RAM,,English,2015.0,Aug,,,51.0,68.0
1959,Super Animal Royale Season 3 Starter Pack,"Windows,Mac OS",Rp 40 000,Pixile,Modus Games,Positive,"Action,Adventure,Free to Play,Massively Multip...","Action,Adventure,Casual,Free to Play,Indie,Mas...",3 GB RAM,https://store.akamai.steamstatic.com/public/sh...,"English,Japanese,Korean,Russian,Simplified Chi...",2022.0,Apr,,,100.0,34.0
28892,Metal Commando,Windows,Rp 8 499,indiegames3000,indiegames3000,Mixed,"Singleplayer,2D,Action,Adventure,Casual,Indie,...",Indie,1 GB RAM,,English,2020.0,Nov,,,68.0,41.0


#### extract `supported_language`

In [14]:
df_extract_3 = df_extract_2.copy()
df_extract_3['supported_language'] = df_extract_3['language'].str.split(',').apply(lambda x: len(x) if type(x) == list else x)
#df_extract_3['english_language'] = df_extract_3['language'].str.lower().str.contains('english')
df_extract_3 = df_extract_3.drop('language',axis=1)
df_extract_3.sample(5)

Unnamed: 0,title,platforms,original_price,developer,publisher,overall_reviews,tags,genre,ram,rating,year,month,user_rating_all,total_review_all,user_rating_recent,total_review_recent,supported_language
114537,灵魂筹码 - 沉冤奈河 Soul at Stake - Drown in the Disgrace,Windows,Rp 22 999,Chongming Studio,Chongming Studio,Mixed,"RPG,Indie,Action,Gore,Violent","Action,Indie,RPG",8 GB RAM,,2019.0,Jun,,,60.0,15.0,2.0
51592,Super Minesweeper attACK,Windows,Rp 8 499,Triadne,Triadne,2 user reviews,"Indie,Casual","Casual,Indie",8 MB RAM,,2019.0,Sep,,,,,1.0
63997,Bug Blast,Windows,Free,Giga Grunts,DigiPen Institute of Technology,8 user reviews,"Top-Down Shooter,Action,Twin Stick Shooter,Rog...","Action,Free to Play",2 GB RAM,,2020.0,Jun,,,,,1.0
42708,Wizard Lady Soundtrack,,Rp 17 499,IR Studio,IR Studio,1 user reviews,,,,,2020.0,Apr,,,,,
38475,Edge of Twilight – Return To Glory,Windows,Rp 115 999,FUZZYEYES,FUZZYEYES,Mostly Negative,"Adventure,Action,Indie,Steampunk","Action,Adventure,Indie",4 GB RAM,,2016.0,Sep,,,37.0,29.0,9.0


#### extract `price`

In [26]:
df_extract_3[df_extract_3['original_price'].str.lower().str.contains('from')==True]

Unnamed: 0,title,platforms,original_price,developer,publisher,overall_reviews,tags,genre,ram,rating,year,month,user_rating_all,total_review_all,user_rating_recent,total_review_recent,supported_language
27972,Amnesia Fortnight,,From Rp 188 997,"2 Player Productions,Double Fine Productions",,Positive,"Gaming,Game Development,Documentary,Episodic",,1 GB RAM,,2017.0,May,,,80.0,10.0,1.0
95150,Hyperdimension Neptunia: The Animation,,From Rp 259 197,"IDEA FACTORY,COMPILE HEART",Idea Factory International,1 user reviews,"Adventure,Action,Episodic,Anime,Comedy,Cute,Nu...",,1 GB RAM,,2020.0,Sep,,,,,4.0
110048,PHOBIA,,From Rp 29 743,Open Sign Productions,Open Sign Films,Mostly Positive,"Nudity,Gore,Violent,Sexual Content,Horror,Epis...",,1 GB RAM,,2018.0,Feb,,,77.0,27.0,11.0


In [34]:
df_price = df_extract_3.copy()
df_price['price'] = df_price['original_price'].str.lower().str.replace(r'\D','').astype(int,errors='ignore')
df_price.loc[df_price['price']=='','price'] = 0
df_price['price'] = df_price['price'].astype(float)
df_price = df_price.drop('original_price',axis=1)
df_price.sample(5)

  df_price['price'] = df_price['original_price'].str.lower().str.replace(r'\D','').astype(int,errors='ignore')


Unnamed: 0,title,platforms,developer,publisher,overall_reviews,tags,genre,ram,rating,year,month,user_rating_all,total_review_all,user_rating_recent,total_review_recent,supported_language,price
111518,HEXAD,Windows,Keenmade,Keenmade,Positive,"Indie,Turn-Based Tactics,Party-Based RPG,Strat...","Free to Play,Indie,RPG,Strategy,Early Access",2 GB RAM,,2021.0,Dec,,,82.0,17.0,1.0,0.0
17558,Brave Survivor,Windows,Felio Stung ROG,kazakovstudios,Positive,"Action,Dungeon Crawler,Difficult,2D Fighter,Ro...","Action,Indie",1024 MB RAM,,2022.0,May,,,90.0,10.0,1.0,119999.0
51387,[Revival] DOA6 Hot Summer Costume - Tina,Windows,"KOEI TECMO GAMES CO., LTD.","KOEI TECMO GAMES CO., LTD.",1 user reviews,"Action,Gore,Violent,Nudity,Sexual Content",Action,8 GB RAM,https://store.akamai.steamstatic.com/public/sh...,2019.0,Nov,,,,,10.0,19000.0
40814,Banzai Escape 2 Subterranean - Maid Costumes,Windows,XenoAisam,Xenoaisam Studio,,"Action,Co-op,Arcade,Shooter,Third-Person Shoot...",Action,8 GB RAM,,2021.0,Oct,,,,,1.0,17499.0
117129,LEGO® DC Super-Villains Batman: The Animated S...,Windows,"TT Games,Feral Interactive (Mac)",Feral Interactive (Mac),Positive,Action,Action,4 GB RAM,https://store.akamai.steamstatic.com/public/sh...,2019.0,Mar,,,86.0,23.0,13.0,39900.0


#### extract `rated_by_pegi`

In [36]:
df_pegi = df_price.copy()
df_pegi['pegi_rated'] = np.where(df_pegi['rating']=='Rating for: PEGI', 1, 0)
df_pegi['age_rating'] = df_pegi['rating'].str.replace(r'\D','')
df_pegi = df_pegi.drop('rating',axis=1)
df_pegi.sample(5)

  df_pegi['age_rating'] = df_pegi['rating'].str.replace(r'\D','')


Unnamed: 0,title,platforms,developer,publisher,overall_reviews,tags,genre,ram,year,month,user_rating_all,total_review_all,user_rating_recent,total_review_recent,supported_language,price,pegi_rated,age_rating
4120,Chuzzle Deluxe,"Windows,Mac OS","PopCap Games, Inc.","PopCap Games, Inc.,Electronic Arts",Very Positive,"Casual,Puzzle,Match 3,Cute,Singleplayer,Funny",Casual,256 MB RAM,2006.0,Aug,92.0,603.0,90.0,10.0,5.0,45999.0,0,
27786,Friendsim 2,"Windows,Mac OS",Studio June,Studio June,Positive,"Visual Novel,Choices Matter,Multiple Endings,S...","Adventure,Casual,Indie,RPG",2 GB RAM,2022.0,Apr,,,92.0,25.0,1.0,0.0,0,
52734,Between Two Cities - Stonemaier Games,Windows,DTDA Games,DTDA Games,1 user reviews,"Strategy,Board Game,Tabletop,Massively Multipl...","Indie,Massively Multiplayer,Strategy",1 GB RAM,2021.0,Aug,,,,,1.0,24999.0,0,7.0
114332,Cloud Climber - Fan Pack,Windows,Two Star Games,Two Star Games,Positive,"Adventure,Free to Play,Indie,Casual,Puzzle,Sur...","Adventure,Casual,Free to Play,Indie",,2021.0,Feb,,,93.0,15.0,1.0,24999.0,0,
27473,最后的大法师 光明之旅(Last Archmage Journey of Light),Windows,Flying Star Games,Flying Star Games,Positive,"JRPG,CRPG,Singleplayer,Visual Novel,Turn-Based...","Adventure,RPG,Strategy",3 GB RAM,2020.0,Jul,,,85.0,27.0,2.0,52999.0,0,


#### extract `windows`, `mac`, and `linux`

In [38]:
df_platform = df_pegi.copy()
df_platform['windows'] = np.where(df_platform['platforms'].str.contains('Windows',na=False), 1, 0)
df_platform['mac'] = np.where(df_platform['platforms'].str.contains('Mac OS',na=False), 1, 0)
df_platform['linux'] = np.where(df_platform['platforms'].str.contains('Linux',na=False), 1, 0)
df_platform['VR'] = np.where(df_platform['platforms'].str.contains('VR Supported',na=False), 1, 0)
df_platform = df_platform.drop('platforms',axis=1)
df_platform.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,genre,ram,year,month,user_rating_all,...,user_rating_recent,total_review_recent,supported_language,price,pegi_rated,age_rating,windows,mac,linux,VR
12060,Superbrothers: Sword & Sworcery EP,"Capybara,Superbrothers,Jim Guthrie",Capybara Games,Mostly Positive,"Adventure,Indie,Pixel Graphics,Great Soundtrac...","Adventure,Indie",1 GB RAM,2012.0,Apr,,...,78.0,1284.0,2.0,59999.0,0,,1,1,1,0
92577,Winter's Empty Mask - Visual novel Demo,Eternal Night Studios,Eternal Night Studios,Positive,"Action,Indie,Casual,Gore,Visual Novel,Violent,...","Action,Casual,Indie",1 GB RAM,2019.0,Mar,,...,100.0,17.0,2.0,0.0,0,,1,1,1,0
113806,State of Decay - Breakdown,Undead Labs,Xbox Game Studios,Mostly Positive,"Zombies,Action,RPG,Simulation,Survival,Open Wo...","Action,RPG,Simulation",2 GB RAM,2013.0,Nov,,...,76.0,145.0,5.0,59999.0,0,18.0,1,0,0,0
51135,In the Service of Mrs. Claus,Choice of Games,Choice of Games,6 user reviews,"Adventure,RPG,Indie,Casual,Text-Based","Adventure,Casual,Indie,RPG",,2019.0,Dec,,...,,,1.0,32999.0,0,,1,1,1,0
9294,Hello Charlotte EP2: Requiem Aeternam Deo,etherane,etherane,Very Positive,"RPG,2D,Horror,Psychological Horror,Anime,Cute,...","Adventure,Indie,RPG",512 MB RAM,2016.0,Nov,97.0,...,100.0,36.0,8.0,,0,15.0,1,0,0,0
6007,新三國 漢室復興,Hermes Games,Hermes Games,Mostly Positive,"Strategy,RPG,RTS,Historical,War,3D,Wargame,Mil...","RPG,Strategy",2048 MB RAM,2022.0,Feb,,...,72.0,29.0,2.0,0.0,0,16.0,1,0,0,0
70179,Beneath The Deep,PaperMoon,PaperMoon,,"Strategy,Adventure,Puzzle,Horror,Psychological...","Indie,Strategy",8 GB RAM,2022.0,,,...,,,1.0,,0,,1,0,0,0
9344,Fish Tycoon 2: Virtual Aquarium,Last Day Of Work,Last Day Of Work,Mixed,Simulation,Simulation,512 MB RAM,2018.0,May,,...,56.0,185.0,1.0,0.0,0,,1,1,0,0
85378,"Fantasy Grounds - Odds & Ends, Volume 7 (Token...","SmiteWorks USA, LLC",,,"RPG,Indie,Strategy,Software","Indie,RPG,Strategy",1 GB RAM,2018.0,Aug,,...,,,1.0,64999.0,0,,1,1,0,0
106947,Baobabs Mausoleum Grindhouse Edition - Country...,Celery Emblem™,Celery Emblem™,3 user reviews,"Adventure,Platformer,Exploration,Action-Advent...","Adventure,Indie",2 GB RAM,2021.0,Feb,,...,,,4.0,102999.0,0,,1,1,0,0


#### remap `ram`

In [39]:
df_ram = df_platform.copy()
df_ram.loc[df_ram['ram'].str.lower().str.contains('9 gb', na=False),'ram'] = '9 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('8 gb|8000 mb|８ gb|8gb|8.0 gb|8\+ gb', na=False),'ram'] = '8 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('7 gb', na=False),'ram'] = '7 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('6 gb|6000 mb|６ gb', na=False),'ram'] = '6 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('5 gb|5000 mb', na=False),'ram'] = '5 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('4 gb|4096 mb|4000 mb|4gb|４ gb|4.00 gb|4g|4 ram gb', na=False),'ram'] = '4 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('3 gb|3000 mb|3 go mb|3gb', na=False),'ram'] = '3 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('2 gb|2048|2000 mb|2gb|1536 mb|2.0 gb|2000 gb|２ gb|1.5gb|2.0gb|2g', na=False),'ram'] = '2 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('1 gb|1024|1000 mb|1gb|800 mb|750 mb|1280 mb|700 mb|900 mb|１ gb|1\+', na=False),'ram'] = '1 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('512|500 mb|400 mb|512mb|600 mb|597 mb|320 mb|420 mb|500mb|349 mb', na=False),'ram'] = '512 MB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('256 gb|200 mb|300 mb|250 mb|256mb|220 mb|256 ram', na=False),'ram'] = '256 MB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('16.0 gb', na=False),'ram'] = '16 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('12.00 gb', na=False),'ram'] = '12 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('10 gb', na=False),'ram'] = '10 GB'
df_ram.loc[df_ram['ram'].str.lower().str.contains('128 mb|64m|70 mb|100 mb|150 mb|16 mb|8 mb|2 mb|1 mb|4 mb|50 mb|3 mb|6 mb|10 mb|128mb|80 mb|4 ram mb|30 mb|40 mb|25 mb|5 mb|120 mb|64 mb|90 mb|60 mb|95 mb|65 mb|64mb|16mb|100以上|32mb|97 mb|20 mb|99 mb|4mb|69 mb|59 mb', na=False),'ram'] = '<128 MB'
df_ram.loc[~df_ram['ram'].str.lower().str.contains('16 gb|12 gb|10 gb|9 gb|8 gb|7 gb|6 gb|5 gb|4 gb|3 gb|2 gb|1 gb|512 mb|256 mb|<128 mb', na=False),'ram'] = 'Unknown'
df_ram['ram'].value_counts()

4 GB       22820
2 GB       20294
Unknown    18386
1 GB       13700
<128 MB    13624
8 GB       10864
6 GB        3124
3 GB         963
5 GB         158
10 GB         30
9 GB           7
7 GB           5
12 GB          1
Name: ram, dtype: int64

In [40]:
df_ram.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,genre,ram,year,month,user_rating_all,...,user_rating_recent,total_review_recent,supported_language,price,pegi_rated,age_rating,windows,mac,linux,VR
121335,Euro Truck Simulator 2 - FH Tuning Pack,SCS Software,SCS Software,Very Positive,"Simulation,Indie,Singleplayer","Indie,Simulation",4 GB,2020.0,Jun,,...,86.0,589.0,24.0,35900.0,0,3.0,1,1,1,0
59291,Dark Empire,Jan,Jan Company,1 user reviews,"Casual,Simulation,Indie","Casual,Indie,Simulation",1 GB,2017.0,Dec,,...,,,1.0,30999.0,0,,1,0,0,0
21047,Dark Fall 2: Lights Out,Darkling Room,THQ Nordic,Mostly Positive,"Adventure,Point & Click,Horror,Puzzle,Mystery,...",Adventure,<128 MB,2013.0,Dec,,...,70.0,62.0,1.0,45999.0,0,7.0,1,0,0,0
75482,Panzer Arena: Prologue,Osman Tsjardiwal,Osman Tsjardiwal,,"Action,Sports,Arcade,eSports,Shooter,Arena Sho...","Action,Indie,Racing,Sports",4 GB,2022.0,Jul,,...,,,1.0,,0,,1,0,0,0
79328,Sid Meier's Civilization VI Demo,"Firaxis Games,Aspyr (Mac),Aspyr (Linux)","2K,Aspyr (Mac),Aspyr (Linux)",Very Positive,"Strategy,Turn-Based Strategy,Historical,Multip...",Strategy,4 GB,2017.0,Mar,83.0,...,85.0,2062.0,12.0,0.0,0,12.0,1,0,0,0
67373,Grammarian Ltd Demo,"Algorocks,Simpleton",Algorocks,Positive,"Simulation,Education,Life Sim,Management,Casua...","Adventure,Casual,Indie,Simulation",3 GB,2022.0,Mar,,...,90.0,10.0,1.0,0.0,0,,1,0,0,0
90072,We Are Screwed!,Rarebyte,Rarebyte,,"Exploration,Roguelite,3D,Funny,Action,Action-A...","Action,Adventure,Casual,Indie",2 GB,2022.0,,,...,,,1.0,,0,,1,1,0,0
51285,Connectome:Pain Control,Melancholia Studio,Melancholia Studio,8 user reviews,"Action,Indie,Fighting,Robots,Great Soundtrack,...","Action,Indie",4 GB,2019.0,Jan,,...,,,4.0,59999.0,0,,1,0,0,0
4703,Guild of Dungeoneering Ultimate Edition,Gambrinous,Gambrinous,Mostly Positive,"Roguelike Deckbuilder,Roguelike,Deckbuilding,R...","Indie,RPG",2 GB,2015.0,Jul,,...,76.0,1565.0,5.0,108999.0,0,,1,1,0,0
51065,Breakout Invaders,DreamsSoftGames,DreamsSoftGames,6 user reviews,"Indie,Casual,Action","Action,Casual,Indie",1 GB,2015.0,Apr,,...,,,3.0,32999.0,0,,1,0,0,0


#### remap `overall_reviews`

In [41]:
df_or = df_ram.copy()
df_or.loc[~df_or['overall_reviews'].str.contains('Positive|Negative|Very|Overwhelmingly|Mixed|Mostly', na=False),'overall_reviews'] = '<10 reviews'
df_or['overall_reviews'].value_counts()

<10 reviews                62387
Very Positive              11779
Mixed                      10099
Positive                   10025
Mostly Positive             7321
Mostly Negative             1586
Overwhelmingly Positive      461
Negative                     245
Very Negative                 64
Overwhelmingly Negative        9
Name: overall_reviews, dtype: int64

In [42]:
df_or.sample(5)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,genre,ram,year,month,user_rating_all,...,user_rating_recent,total_review_recent,supported_language,price,pegi_rated,age_rating,windows,mac,linux,VR
78630,Fantasy Grounds - Deadlands Reloaded: The Flood,"SmiteWorks USA, LLC",,<10 reviews,"RPG,Indie,Strategy,Utilities,Western,Turn-Base...","Indie,RPG,Strategy",1 GB,2016.0,Jun,,...,,,1.0,95999.0,0,,1,0,0,0
60734,The Message,Dream Games,Dream Games,<10 reviews,"Early Access,Action,Violent,Gore,Shooter,Beat ...","Action,Indie,Early Access",8 GB,2020.0,Dec,,...,,,1.0,95999.0,0,,1,0,0,0
41220,The Marauder Chronicles: Curse Over Valdria,Mambo Dancing Shrimp,Gamuzumi,<10 reviews,"Adventure,Sexual Content,Nudity,Visual Novel,F...",Adventure,<128 MB,2021.0,Oct,,...,,,3.0,39999.0,0,,1,0,0,0
38211,DYNASTY WARRIORS 9 Empires - Unisex Custom Cun...,"KOEI TECMO GAMES CO., LTD.","KOEI TECMO GAMES CO., LTD.",<10 reviews,"Strategy,Action,Simulation","Action,Simulation,Strategy",6 GB,2022.0,Jan,,...,,,9.0,57000.0,0,12.0,1,0,0,0
84144,Heroine Anthem Zero 2：Colorful Feather Pack IV,WindThunder Studio,Skywalker HK,<10 reviews,"Action,Adventure,RPG","Action,Adventure,RPG",6 GB,2022.0,Feb,,...,,,4.0,24999.0,0,,1,0,0,0


#### reformat and remap `tags`

In [49]:
def to_1D(series): #list feature function
 return pd.Series([x for _list in series for x in _list])

In [43]:
df_tags = df_or.copy()
df_tags.loc[df_tags['tags'].isna()==True,'tags'] = 'No Tags'
df_tags['tags'] = df_tags['tags'].str.split(',')
#df_tags['tags'] = df_tags['tags'].replace(',', '","')
#df_tags['tags'] = '["' + df_tags['tags'] + '"]'
df_tags.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,genre,ram,year,month,user_rating_all,...,user_rating_recent,total_review_recent,supported_language,price,pegi_rated,age_rating,windows,mac,linux,VR
56044,INSPACE 2980,R_Games,R_Games,<10 reviews,"[Shoot 'Em Up, Bullet Hell, Space, Old School,...","Action,Adventure,Indie",2 GB,2018.0,Oct,,...,,,2.0,24999.0,0,,1,1,0,0
96793,Tiger Fighter 1931 Tora!Tora!Tora! MP086,TigerQiuQiu,TigerQiuQiu,<10 reviews,"[Action, Indie, Casual, Old School, Pixel Grap...","Action,Casual,Indie",4 GB,2021.0,Sep,,...,,,1.0,8499.0,0,,1,0,0,0
121851,Total War: WARHAMMER II,"CREATIVE ASSEMBLY,Feral Interactive (Mac),Fera...","SEGA,Feral Interactive (Mac),Feral Interactive...",Very Positive,"[Strategy, Fantasy, Turn-Based Strategy, RTS, ...","Action,Strategy",5 GB,2017.0,Sep,93.0,...,89.0,573.0,13.0,540999.0,0,16.0,1,1,1,0
76350,Super Catboy,Pixelpogo,Assemble Entertainment,<10 reviews,"[Precision Platformer, Pixel Graphics, Action-...","Action,Adventure,Indie",4 GB,2022.0,,,...,,,10.0,,0,,1,1,0,0
104839,Journey of Greed - Male Skin Pack,Dird Games,X.D. Network Inc.,<10 reviews,"[Strategy, Indie, Card Game, Board Game, Tradi...","Indie,Strategy",2 GB,2020.0,Mar,,...,,,3.0,24999.0,0,3.0,1,1,0,0
29311,佣兵战歌,酷呐网络科技,酷呐网络科技,Mixed,"[Strategy, Adventure, RPG, Strategy RPG, Rogue...","Adventure,Casual,Indie,RPG,Strategy",8 GB,2020.0,Jan,,...,47.0,42.0,2.0,64999.0,0,,1,0,0,0
16576,Altwaldheim: Town in Turmoil,Ascendancy Games,Ascendancy Games,Positive,"[City Builder, Roguelite, Building, Management...","Casual,Indie,Simulation,Strategy",<128 MB,2020.0,Aug,,...,91.0,12.0,2.0,69999.0,0,,1,0,0,0
93067,Fay's Factory,egor dorogov,egor dorogov,<10 reviews,"[RPG, Turn-Based Strategy, Capitalism, Deckbui...","Indie,RPG,Strategy",4 GB,2022.0,,,...,,,1.0,,0,,1,0,0,0
741,RISK: Global Domination,SMG Studio,SMG Studio,Very Positive,"[Free to Play, Board Game, Multiplayer, Strate...","Casual,Free to Play,Strategy",2 GB,2020.0,Feb,83.0,...,81.0,541.0,9.0,0.0,0,7.0,1,1,0,0
62391,"Grass Cutters Academy - 50,000 Upgrade Points",ColloseusX,Phat Phrog Studios,<10 reviews,"[Free to Play, Indie, Casual]","Casual,Free to Play,Indie",<128 MB,2021.0,Feb,,...,,,1.0,59999.0,0,,1,0,0,0


In [51]:
to_1D(df_tags['tags']).value_counts().head(50)

Indie                 54882
Action                41227
Casual                35092
Adventure             34938
Singleplayer          34624
Simulation            22556
Strategy              21265
RPG                   20146
2D                    18546
Atmospheric           12636
Puzzle                11747
Pixel Graphics        10670
3D                    10459
Story Rich            10377
Fantasy                9994
Colorful               9534
Multiplayer            9030
Early Access           8974
Exploration            8696
Cute                   8223
First-Person           8047
Arcade                 7827
No Tags                7770
Funny                  7176
Shooter                7117
Free to Play           7114
Sci-fi                 7022
Platformer             6859
Retro                  6745
Family Friendly        6612
Anime                  6372
Horror                 6243
Relaxing               6208
Action-Adventure       6132
Violent                6071
Difficult           

#### reformat and remap `genre`

In [45]:
df_genre = df_tags.copy()
df_genre.loc[df_genre['genre'].isna()==True,'genre'] = 'No Genre'
df_genre['genre'] = df_genre['genre'].str.split(',')
#df_tags['tags'] = df_tags['tags'].replace(',', '","')
#df_tags['tags'] = '["' + df_tags['tags'] + '"]'
df_genre.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,genre,ram,year,month,user_rating_all,...,user_rating_recent,total_review_recent,supported_language,price,pegi_rated,age_rating,windows,mac,linux,VR
47192,OASE - Other Age Second Encounter,Zeiva Inc,Zeiva Inc,<10 reviews,"[Adventure, Indie, Casual, Simulation, Visual ...","[Adventure, Casual, Indie, Simulation]",2 GB,2015.0,Sep,,...,,,1.0,135999.0,0,,1,0,0,0
36564,The Binding of YOU,StarSystemStudios,StarSystemStudios,Mostly Negative,"[Casual, Action, Indie, Strategy]","[Action, Casual, Indie, Strategy]",2 GB,2019.0,Sep,,...,25.0,12.0,1.0,8499.0,0,,1,0,0,0
30070,Soccer Pinball Thrills,"Fancy Bytes, Reactor",United Independent Entertainment,Mixed,"[Casual, Simulation, Sports, Pinball]","[Casual, Simulation, Sports]",2 GB,2015.0,Jun,,...,40.0,10.0,2.0,35999.0,0,,1,0,0,0
2492,The Sims 3 - Movie Stuff,The Sims Studio,,Mixed,[Simulation],[Simulation],5 GB,2013.0,Sep,,...,63.0,22.0,17.0,282000.0,0,12.0,1,0,0,0
67387,Song of Iron Soundtrack,Resting Relic,,<10 reviews,[No Tags],[No Genre],Unknown,2021.0,Sep,,...,,,,52999.0,0,,0,0,0,0
111739,He Will Shoot,Boom Games,Boom Games,Mixed,"[Zombies, Post-apocalyptic, Arena Shooter, Thi...","[Action, Adventure, Indie, RPG, Simulation]",4 GB,2021.0,Dec,,...,45.0,22.0,1.0,17499.0,0,,1,0,0,0
77559,Fantasy Grounds - A04: Forest for the Trees (P...,"SmiteWorks USA, LLC",,<10 reviews,"[RPG, Indie, Strategy, Fantasy, Medieval, Part...","[Indie, RPG, Strategy]",1 GB,2016.0,Jul,,...,,,1.0,48999.0,0,,1,0,0,0
900,NBA 2K21,Visual Concepts,2K,Mixed,"[Basketball, Sports, Simulation, Character Cus...","[Simulation, Sports]",4 GB,2020.0,Sep,42.0,...,51.0,99.0,9.0,699000.0,0,3.0,1,0,0,0
103524,DOA6 Rig Deluxe Costume,"KOEI TECMO GAMES CO., LTD.","KOEI TECMO GAMES CO., LTD.",<10 reviews,"[Action, Gore, Violent, Nudity, Sexual Content]",[Action],8 GB,2019.0,Aug,,...,,,10.0,19000.0,0,16.0,1,0,0,0
10495,Valentine's Day Set,"Team NINJA,KOEI TECMO GAMES CO., LTD.",,Positive,[Action],[Action],2 GB,2016.0,Feb,,...,100.0,12.0,8.0,159999.0,0,16.0,1,0,0,0


In [56]:
to_1D(df_genre['genre']).value_counts()

Indie                    60589
Action                   39905
Casual                   32763
Adventure                32583
Simulation               21664
Strategy                 20129
RPG                      19297
No Genre                  8359
Early Access              7393
Free to Play              6726
Sports                    4460
Racing                    3738
Massively Multiplayer     2844
Design & Illustration     2020
Web Publishing            1613
Utilities                 1184
Animation & Modeling       620
Education                  549
Software Training          387
Video Production           384
Game Development           360
Audio Production           310
Photo Editing              263
Accounting                  19
Movie                        3
Documentary                  1
Episodic                     1
Short                        1
Tutorial                     1
360 Video                    1
dtype: int64

#### reformat and remap `developer`

In [57]:
df_dev = df_genre.copy()
df_dev.loc[df_dev['developer'].isna()==True,'developer'] = 'Unknown'
df_dev['developer'] = df_dev['developer'].str.split(',')
#df_tags['tags'] = df_tags['tags'].replace(',', '","')
#df_tags['tags'] = '["' + df_tags['tags'] + '"]'
df_dev.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,genre,ram,year,month,user_rating_all,...,user_rating_recent,total_review_recent,supported_language,price,pegi_rated,age_rating,windows,mac,linux,VR
21987,Ebi-Hime Bundle,[Unknown],,<10 reviews,[No Tags],[No Genre],Unknown,,,,...,,,,1034380.0,0,,1,0,1,0
54756,unBorn,[Frenetic Gaming],Frenetic Gaming,<10 reviews,"[Indie, Casual, Simulation]","[Casual, Indie, Simulation]",2 GB,2018.0,Jul,,...,,,1.0,8499.0,0,,1,0,0,0
54419,ELMIA Original Soundtrack,[AECRNIA],AnelaGamesStudio,<10 reviews,"[Action, Indie, Casual, Fantasy, Story Rich, G...","[Action, Casual, Indie]",Unknown,2017.0,Dec,,...,,,2.0,59999.0,0,,1,0,0,0
9305,Super Mustache,[Redro Games],Back To Basics Gaming,Mostly Positive,"[Indie, Adventure, Action, Platformer, Casual,...","[Action, Adventure, Casual, Indie]",<128 MB,2016.0,Jan,,...,74.0,276.0,1.0,8499.0,0,,1,0,0,0
8498,RISK: Global Domination - Countries & Continen...,[SMG Studio],,Positive,"[Strategy, Free to Play, Casual]","[Casual, Free to Play, Strategy]",Unknown,2020.0,Sep,,...,80.0,10.0,9.0,32999.0,0,7.0,1,1,0,0
116206,Intrude,[Michal Kruba],Michal Kruba,Very Positive,"[Action, Indie, FPS, Shooter, Retro, Pixel Gra...","[Action, Indie]",1 GB,2016.0,Aug,,...,84.0,176.0,1.0,35999.0,0,,1,0,0,0
8816,Euro Truck Simulator 2 - Swiss Paint Jobs Pack,[SCS Software],SCS Software,Positive,"[Simulation, Indie]","[Indie, Simulation]",4 GB,2016.0,Jul,,...,95.0,42.0,23.0,9200.0,0,3.0,1,1,1,0
56630,送小鸡回家 sendchickenhome,[独自一人游戏工作室],念如饺,<10 reviews,"[Indie, Casual, Action, Adventure, Puzzle]","[Action, Adventure, Casual, Indie]",8 GB,2020.0,Apr,,...,,,2.0,8499.0,0,,1,0,0,0
114075,Event-D,[Freedintale Studio],Freedintale Studio,Positive,"[Adventure, Visual Novel, Sci-fi, Futuristic, ...",[Adventure],2 GB,2018.0,Dec,,...,81.0,22.0,2.0,39999.0,0,,1,0,0,0
4216,9-nine-:Episode 1,[PALETTE],Sekai Project,Very Positive,"[Visual Novel, Anime, Sexual Content, Nudity, ...","[Casual, Simulation]",4 GB,2019.0,Jan,95.0,...,92.0,13.0,3.0,143942.0,0,,1,0,0,0


In [59]:
to_1D(df_dev['developer']).value_counts().head(50)

Unknown                           6112
 LLC                              2463
SmiteWorks USA                    1965
 LTD.                             1831
KOEI TECMO GAMES CO.              1647
Ubisoft - San Francisco           1517
TigerQiuQiu                       1443
 Ltd.                              830
 Inc.                              667
CAPCOM Co.                         506
Dovetail Games                     381
N3V Games                          335
Feral Interactive (Mac)            307
Nihon Falcom                       272
Choice of Games                    253
Feral Interactive (Linux)          240
Idea Factory                       225
Square Enix                        223
Milestone S.r.l.                   213
 Inc                               210
Tamsoft                            205
Inc.                               203
Paradox Development Studio         200
Arc System Works                   175
Gotcha Gotcha Games                168
Creobit                  

#### reformat and remap `publisher`

In [60]:
df_pub = df_dev.copy()
df_pub.loc[df_pub['publisher'].isna()==True,'publisher'] = 'Unknown'
df_pub['publisher'] = df_pub['publisher'].str.split(',')
df_pub.sample(10)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,genre,ram,year,month,user_rating_all,...,user_rating_recent,total_review_recent,supported_language,price,pegi_rated,age_rating,windows,mac,linux,VR
21106,Animal and Farmer,[Unknown],[Unknown],<10 reviews,[No Tags],[No Genre],Unknown,,,,...,,,,184498.0,0,,1,0,0,0
37843,Bridge to Another World: Christmas Flight Coll...,[Friendly Fox],[Big Fish Games],<10 reviews,"[Hidden Object, Puzzle, Point & Click, Female ...","[Adventure, Casual]",1 GB,2022.0,Jan,,...,,,1.0,89999.0,0,,1,0,0,0
102293,Legion of Scorn,[Phoenix Resurrection Games],[Phoenix Resurrection Games],<10 reviews,"[Indie, Retro, Top-Down Shooter, Twin Stick Sh...",[Indie],2 GB,2019.0,May,,...,,,1.0,8499.0,0,,1,0,0,0
56987,CADE PRIME,[Archor Wright],[Archor Games],<10 reviews,"[Casual, Puzzle, Puzzle Platformer, 3D, Parkou...",[Casual],Unknown,2021.0,Sep,,...,,,1.0,8499.0,0,,1,0,0,0
7899,LOGistICAL 3: Earth,[Sacada],[Sacada],Very Positive,"[Strategy, Indie, Casual, Puzzle, Resource Man...","[Casual, Indie, Strategy]",4 GB,2020.0,Jul,,...,85.0,92.0,10.0,169999.0,0,,1,0,0,0
26773,Astronaut: The Moon Eclipse,[AppChu Indie Games],[AppChu Indie Games],Positive,"[Action, Casual, Indie, 2D]","[Action, Casual, Indie]",<128 MB,2018.0,Dec,,...,100.0,12.0,27.0,17499.0,0,,1,0,0,0
51444,Red Algorithm - Book,[Eugene Andreev],[Eugene Andreev],<10 reviews,"[Action, RPG, Indie, Free to Play]","[Action, Indie, RPG]",<128 MB,2021.0,Oct,,...,,,2.0,17499.0,0,,1,0,0,0
76576,Call of Duty®: Ghosts - Space Cats Pack,[Infinity Ward],[Activision],<10 reviews,[Action],[Action],6 GB,2014.0,Mar,,...,,,5.0,18999.0,0,16.0,1,0,0,0
80014,Hermodr,[Regday NULL],[Regday NULL],<10 reviews,"[Action, Indie, Gore, Violent, Retro, FPS, Pix...","[Action, Indie]",2 GB,,,,...,,,2.0,,0,,1,0,1,0
54397,Bubbles & Pearls,[Hot Cocoa Games],[Hot Cocoa Games],<10 reviews,"[Sports, Exploration, Immersive Sim, 3D, 3D Vi...","[Casual, Simulation, Sports]",4 GB,2022.0,Mar,,...,,,1.0,39999.0,0,,1,0,0,1


In [61]:
to_1D(df_pub['publisher']).value_counts().head(50)

Unknown                       14772
 LTD.                          1517
Degica                         1509
TigerQiuQiu                    1443
KOEI TECMO GAMES CO.           1385
 Inc.                          1094
 Ltd.                           769
Dovetail Games - Trains         561
BANDAI NAMCO Entertainment      531
 LLC                            528
CAPCOM Co.                      495
Paradox Interactive             458
SEGA                            456
Square Enix                     442
Big Fish Games                  405
Ubisoft                         365
Electronic Arts                 347
N3V Games                       347
XSEED Games                     341
Marvelous USA                   334
THQ Nordic                      318
D3 PUBLISHER                    315
Feral Interactive (Mac)         284
Choice of Games                 251
2K                              249
PlayWay S.A.                    241
Slitherine Ltd.                 236
Dovetail Games - Flight     

## data understanding

In [63]:
pd.options.display.max_columns=None

In [64]:
df_final = df_pub.copy()
df_final.sample(5)

Unnamed: 0,title,developer,publisher,overall_reviews,tags,genre,ram,year,month,user_rating_all,total_review_all,user_rating_recent,total_review_recent,supported_language,price,pegi_rated,age_rating,windows,mac,linux,VR
5733,Craft The World - Heroes,[Dekovir Entertainment],[Unknown],Mostly Positive,"[Strategy, RPG, Indie, Simulation]","[Indie, RPG, Simulation, Strategy]",1 GB,2020.0,Jun,,,72.0,61.0,11.0,32999.0,0,,1,1,0,0
71754,Citizens: Far Lands Demo,[Redkar Limited],"[Redkar Limited, Hawthorn Games]",<10 reviews,"[Resource Management, Minimalist, City Builder...","[Simulation, Strategy]",4 GB,2021.0,Sep,,,,,8.0,0.0,0,,1,0,0,0
8784,HELLDIVERS™ - Vehicles Pack,[Arrowhead Game Studios],[PlayStation PC LLC],Very Positive,[Action],[Action],4 GB,2015.0,Dec,,,89.0,67.0,17.0,52000.0,0,16.0,1,0,0,0
77128,Cube Conflict,[Jean Onche],[Jean Onche],<10 reviews,"[Early Access, FPS, Fighting, Shooter, Experim...","[Action, Free to Play, Indie, Early Access]",2 GB,2022.0,Jul,,,,,2.0,0.0,0,,1,0,0,0
11582,Distant Space 2,[PixelMouse],[PixelMouse],Very Positive,"[Action, Adventure, Indie, Casual]","[Action, Adventure, Casual, Indie]",2 GB,2017.0,Nov,,,82.0,102.0,1.0,8499.0,0,,1,0,0,0


In [65]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103976 entries, 0 to 121903
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   title                103975 non-null  object 
 1   developer            103976 non-null  object 
 2   publisher            103976 non-null  object 
 3   overall_reviews      103976 non-null  object 
 4   tags                 103976 non-null  object 
 5   genre                103976 non-null  object 
 6   ram                  103976 non-null  object 
 7   year                 95020 non-null   float64
 8   month                92397 non-null   object 
 9   user_rating_all      5360 non-null    float64
 10  total_review_all     5360 non-null    float64
 11  user_rating_recent   41589 non-null   float64
 12  total_review_recent  41589 non-null   float64
 13  supported_language   94557 non-null   float64
 14  price                95861 non-null   float64
 15  pegi_rated       

In [66]:
df_final.describe()

Unnamed: 0,year,user_rating_all,total_review_all,user_rating_recent,total_review_recent,supported_language,price,pegi_rated,windows,mac,linux,VR
count,95020.0,5360.0,5360.0,41589.0,41589.0,94557.0,95861.0,103976.0,103976.0,103976.0,103976.0,103976.0
mean,2018.774626,84.747388,12738.07,76.794248,225.190531,4.191144,68111.32,0.0,0.96228,0.25528,0.152141,0.008444
std,4.564053,11.84031,104502.4,18.307801,4642.111484,5.136509,136226.8,0.0,0.19052,0.436021,0.359159,0.091504
min,1705.0,12.0,10.0,0.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2017.0,80.0,779.75,66.0,18.0,1.0,17499.0,0.0,1.0,0.0,0.0,0.0
50%,2019.0,88.0,2198.0,81.0,40.0,2.0,39999.0,0.0,1.0,0.0,0.0,0.0
75%,2021.0,93.0,6605.0,91.0,123.0,6.0,82999.0,0.0,1.0,1.0,0.0,0.0
max,3021.0,100.0,6464937.0,100.0,555285.0,29.0,8784144.0,0.0,1.0,1.0,1.0,1.0


## export csv

In [68]:
df_final.to_csv('for_EDA.csv', index=False)