In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer



In [2]:
# Read the games table from disk and preview its first five rows.
games_csv_path = 'games.csv'
df = pd.read_csv(games_csv_path)
df.head(5)

Unnamed: 0.1,Unnamed: 0,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies,Genres List
0,0,Galactic Bowling,2008-10-21,10000.0,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,"['Casual', 'Indie', 'Sports']"
1,1,Train Bandit,2017-10-12,10000.0,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,"['Action', 'Indie']"
2,2,Jolt Project,2021-11-17,10000.0,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",No tags available for this game,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,"['Action', 'Adventure', 'Indie', 'Strategy']"
3,3,Henosis™,2020-07-23,10000.0,0,0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,"['Adventure', 'Casual', 'Indie']"
4,4,Two Weeks in Painland,2020-02-03,10000.0,0,0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,"['Adventure', 'Indie']"


In [3]:
# Remove the 'Unnamed: 0' column (presumably an index written out by an
# earlier to_csv call -- confirm against how games.csv was produced).
# Rebinding `df` instead of using inplace=True, combined with
# errors='ignore', keeps this cell safe to re-run once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85075 entries, 0 to 85074
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Name                        85075 non-null  object 
 1   Release date                85075 non-null  object 
 2   Estimated owners            85075 non-null  float64
 3   Peak CCU                    85075 non-null  int64  
 4   Required age                85075 non-null  int64  
 5   Price                       85075 non-null  float64
 6   DLC count                   85075 non-null  int64  
 7   About the game              85075 non-null  object 
 8   Supported languages         85075 non-null  object 
 9   Full audio languages        85075 non-null  object 
 10  Header image                85075 non-null  object 
 11  Support email               85075 non-null  object 
 12  Windows                     85075 non-null  bool   
 13  Mac                         850

In [5]:
# Parse the release dates into datetime values.
df['Release date'] = pd.to_datetime(df['Release date'])

# Cast the text-like columns to pandas' dedicated 'string' dtype.
# The column list is named once so both sides of the assignment stay in sync.
text_columns = [
    'Name', 'About the game', 'Supported languages', 'Full audio languages',
    'Header image', 'Support email', 'Developers', 'Publishers', 'Categories',
    'Genres', 'Tags', 'Screenshots', 'Movies', 'Genres List',
]
df[text_columns] = df[text_columns].astype('string')

In [6]:
# Print the frame summary again to check the dtypes after the
# datetime / string conversions performed in the previous cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85075 entries, 0 to 85074
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Name                        85075 non-null  string        
 1   Release date                85075 non-null  datetime64[ns]
 2   Estimated owners            85075 non-null  float64       
 3   Peak CCU                    85075 non-null  int64         
 4   Required age                85075 non-null  int64         
 5   Price                       85075 non-null  float64       
 6   DLC count                   85075 non-null  int64         
 7   About the game              85075 non-null  string        
 8   Supported languages         85075 non-null  string        
 9   Full audio languages        85075 non-null  string        
 10  Header image                85075 non-null  string        
 11  Support email               85075 non-null  string    

In [7]:
def preprocess_text(text):
    """Return `text` lowercased with leading/trailing whitespace removed.

    Args:
        text: A string to normalise.

    Returns:
        The lowercased, stripped string.
    """
    lowered = text.lower()
    return lowered.strip()

# Normalise every game description with preprocess_text, then display the column.
about_column = 'About the game'
df[about_column] = df[about_column].apply(preprocess_text)
df[about_column]


0        galactic bowling is an exaggerated and stylize...
1        the law!! looks to be a showdown atop a train....
2        jolt project: the army now has a new robotics ...
3        henosis™ is a mysterious 2d platform puzzler w...
4        about the game play as a hacker who has arrang...
                               ...                        
85070    marshal mannerheim’s saloon car is the train c...
85071    beer run is an indie game created to steal bee...
85072    a small 'horror' narrative game about isolatio...
85073    path of survivors is a multi-class auto-battle...
85074    meet mariah, maria, and missy a group of colle...
Name: About the game, Length: 85075, dtype: object

In [8]:
def process_genres(genres):
    """Split a comma-separated genre string into a list of clean genre names.

    Each token is stripped of surrounding whitespace. Blank tokens (from an
    empty string, a trailing comma or a doubled comma) are discarded so they
    cannot become an empty-named class when the lists are binarized.

    Args:
        genres: Comma-separated genre names, e.g. ``"Casual,Indie,Sports"``.
            A non-string value (``None``, ``pd.NA``, ``NaN``) is treated as
            "no genres".

    Returns:
        A list of genre names in their original order; ``[]`` when there are
        none.
    """
    # Missing values have no .split(); treat them as an empty label set.
    if not isinstance(genres, str):
        return []
    stripped_tokens = (token.strip() for token in genres.split(','))
    return [name for name in stripped_tokens if name]

# Turn each comma-separated genre string into a list of genre names.
# Kept in a separate Series so the 'Genres' column is not overwritten with lists.
genre_lists = df['Genres'].apply(process_genres)

# Binarize the label lists into one indicator column per genre.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(genre_lists)

# pd.concat(axis=1) aligns on index labels, so give the indicator frame the
# same index as df; a default RangeIndex would misalign rows whenever df's
# index is not 0..n-1.
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=df.index)

# Replace the raw 'Genres' column with the indicator columns (appended last).
df = pd.concat([df.drop(columns=['Genres']), genre_df], axis=1)
df.head()

Unnamed: 0,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,Full audio languages,...,Short,Simulation,Software Training,Sports,Strategy,Tutorial,Utilities,Video Production,Violent,Web Publishing
0,Galactic Bowling,2008-10-21,10000.0,0,0,19.99,0,galactic bowling is an exaggerated and stylize...,['English'],[],...,0,0,0,1,0,0,0,0,0,0
1,Train Bandit,2017-10-12,10000.0,0,0,0.99,0,the law!! looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",[],...,0,0,0,0,0,0,0,0,0,0
2,Jolt Project,2021-11-17,10000.0,0,0,4.99,0,jolt project: the army now has a new robotics ...,"['English', 'Portuguese - Brazil']",[],...,0,0,0,0,1,0,0,0,0,0
3,Henosis™,2020-07-23,10000.0,0,0,5.99,0,henosis™ is a mysterious 2d platform puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",[],...,0,0,0,0,0,0,0,0,0,0
4,Two Weeks in Painland,2020-02-03,10000.0,0,0,0.0,0,about the game play as a hacker who has arrang...,"['English', 'Spanish - Spain']",[],...,0,0,0,0,0,0,0,0,0,0


In [9]:
# 'Positive' and 'Negative' are the target columns; every other column stays
# in the feature frame.
target_columns = ['Positive', 'Negative']
X = df.drop(columns=target_columns)
y = df[target_columns]

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and the CSV files exported from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [10]:
# Re-attach the targets to their feature rows for each split.
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# Write both splits to CSV without the index column.
for split_frame, csv_path in (
    (train_data, 'train_data.csv'),
    (test_data, 'test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)