In [1]:
import pandas as pd
import numpy as np
import scipy

from sklearn import preprocessing
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

from kmodes.kprototypes import KPrototypes

In [2]:
video_game_sales = pd.read_csv('Video_Games_Sales_as_at_22_Dec_2016.csv')
video_game_sales['Name'] = video_game_sales['Name'].str.lower()

In [3]:
video_game_sales['Name'] = video_game_sales['Name'].str.lower()
#impute missing value
video_game_sales_imp=pd.concat([video_game_sales.select_dtypes(include=np.number).fillna(-0.1),
                              video_game_sales.select_dtypes(exclude=np.number).fillna('None')],
                              axis=1)
video_game_sales_imp.loc[video_game_sales_imp['User_Score'].str.isnumeric()!=True, 'User_Score']=-0.1
video_game_sales_imp['User_Score']=video_game_sales_imp['User_Score'].astype('float')

In [4]:
video_game_sales_label=video_game_sales_imp.filter(regex='Sales$', axis=1)
video_game_sales_percent=video_game_sales_label.copy()
video_game_sales_percent['NA_Sales']=video_game_sales_percent['NA_Sales']/video_game_sales_percent['Global_Sales']
video_game_sales_percent['EU_Sales']=video_game_sales_percent['EU_Sales']/video_game_sales_percent['Global_Sales']
video_game_sales_percent['JP_Sales']=video_game_sales_percent['JP_Sales']/video_game_sales_percent['Global_Sales']
video_game_sales_percent['Other_Sales']=video_game_sales_percent['Other_Sales']/video_game_sales_percent['Global_Sales']

video_game_sales_percent.drop(['Global_Sales'], axis=1, inplace=True)

In [5]:
kmeans_kwargs={
    'init':'random',
    'n_init':10,
    'max_iter': 300,
    'random_state':42
}

kmeans = KMeans(n_clusters=5, **kmeans_kwargs)
kmeans.fit(video_game_sales_percent)
sales_label = kmeans.labels_

video_game_sales_percent['sales_label']=pd.Series(sales_label).replace(
    {0:'JP&NA', 1:'NA', 2:'JP', 3:'NA&EU',4:'EU'})

In [6]:
#Deal wiht the year information and add year label for every 5 years interval between 1980 and 2020
def year_label(x):
    x_label=0.125
    if x>=1985 and x<1990:
        x_label=0.25
    elif x>=1990 and x<1995:
        x_label=0.375
    elif x>=1995 and x<2000:
        x_label=0.5
    elif x>=2000 and x<2005:
        x_label=0.625
    elif x>=2005 and x<2010:
        x_label=0.75
    elif x>=2010 and x<2015:
        x_label=0.875
    elif x>=2015:
        x_label=1
    return x_label

video_game_sales_imp['interval_2020']=video_game_sales['Year_of_Release'].apply(year_label)

In [7]:
video_game_sales_num = video_game_sales_imp[video_game_sales_imp.select_dtypes(include=np.number).columns.tolist()]
min_max_scaler=preprocessing.MinMaxScaler()
video_game_sales_scaled=pd.DataFrame(min_max_scaler.fit_transform(video_game_sales_num))
video_game_sales_scaled.columns=video_game_sales_num.columns

games_train_num=video_game_sales_scaled[['Critic_Score','User_Count', 'User_Score']]
games_train_num['interval_years_2020']=video_game_sales['Year_of_Release'].apply(year_label)
games_train_num['gloabl_sales_log']=(np.log(video_game_sales['Global_Sales'])-np.log(video_game_sales['Global_Sales']).min()+0.01)/10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
games_train_cat=video_game_sales_imp.drop(columns=['Name', 'Publisher', 'Developer']).select_dtypes(exclude=np.number)
games_train_cat['regional_sales_label']=pd.Series(sales_label).replace(
    {0:'JP&NA', 1:'NA', 2:'JP', 3:'NA&EU',4:'EU'})

In [9]:
# Convert dataframe to matrix
games_train = pd.concat([games_train_cat, games_train_num], axis=1)
games_matrix=games_train.to_numpy()

In [10]:
catColumnsPos=[games_train.columns.get_loc(col) for col in games_train_cat.columns.to_list()]
kprototype = KPrototypes(n_jobs = -1, n_clusters = 27, init = 'Huang', random_state = 0)
kprototype.fit_predict(games_matrix, categorical = catColumnsPos)

array([ 3, 10,  5, ..., 17,  0, 17], dtype=uint16)

In [11]:
game_label=kprototype.labels_
video_game_group=video_game_sales[['Name']]
video_game_group['game_label']=pd.Series(game_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
video_game_group

Unnamed: 0,Name,game_label
0,wii sports,3
1,super mario bros.,10
2,mario kart wii,5
3,wii sports resort,3
4,pokemon red/pokemon blue,23
...,...,...
16714,samurai warriors: sanada maru,12
16715,lma manager 2007,24
16716,haitaka no psychedelica,17
16717,spirits & spells,0
