In [3]:
#!pip install category_encoders

In [4]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from category_encoders import MEstimateEncoder

In [5]:
# Read the labelled training set and the unlabelled submission set.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Print both column indexes so the schemas can be compared by eye.
print(train.columns, test.columns, sep='\n')

Index(['name', 'seller', 'price', 'vehicleType', 'yearOfRegistration',
       'gearbox', 'powerPS', 'model', 'kilometer', 'monthOfRegistration',
       'fuelType', 'brand', 'notRepairedDamage', 'dateCreated', 'nrOfPictures',
       'postalCode'],
      dtype='object')
Index(['name', 'seller', 'vehicleType', 'yearOfRegistration', 'gearbox',
       'powerPS', 'model', 'kilometer', 'monthOfRegistration', 'fuelType',
       'brand', 'notRepairedDamage', 'dateCreated', 'nrOfPictures',
       'postalCode'],
      dtype='object')


In [6]:
train.head()

Unnamed: 0,name,seller,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode
0,Opel_Astra_G_Cabrio,privat,3800,cabrio,2001,,0,astra,150000,0,,opel,nein,2016-03-08,0,26789
1,Alfa_Romeo_166_2.0_Twin_Spark_EFH_Klimaautomat...,privat,590,suv,2017,automatik,50,omega,150000,3,benzin,mercedes_benz,nein,2016-03-26,0,35516
2,BMW_525d,privat,4489,limousine,2002,automatik,163,5er,150000,2,diesel,bmw,nein,2016-04-05,0,56751
3,Volkswagen_Fox_1.2,privat,1400,kleinwagen,2005,manuell,54,fox,150000,10,benzin,volkswagen,nein,2016-03-16,0,47198
4,Mercedes_Benz_220_CDI,privat,9300,limousine,2005,manuell,150,andere,150000,2,diesel,mercedes_benz,nein,2016-03-11,0,94344


In [7]:
test.head()

Unnamed: 0,name,seller,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode
0,Peugeot_807_HDi_130_NAVTECH_ON_BOARD,privat,bus,2004,manuell,128,andere,150000,3,diesel,peugeot,nein,4/5/16,0,85764
1,Mercedes_C_200_Kompressor_Sportcoupe_zum_verka...,privat,coupe,2001,automatik,163,c_klasse,150000,5,benzin,mercedes_benz,,3/8/16,0,21149
2,Volkswagen_Golf_VI_1.4_TSI_Comfortline,privat,limousine,2009,automatik,160,golf,30000,2,benzin,volkswagen,nein,3/21/16,0,58089
3,VW_Polo_9n_1.2l__Top_!!!,privat,kleinwagen,2003,manuell,64,polo,150000,5,benzin,volkswagen,nein,3/25/16,0,49685
4,Hyundai_Santa_Fe_2.0_CRDi_4WD_4WD_Edition+,privat,suv,2004,manuell,113,santa,150000,6,diesel,hyundai,nein,3/13/16,0,54472


In [8]:
# Target is the listing price; every other column is a candidate feature.
y = train['price']
x = train.drop(columns=['price'])

In [9]:
# Report the feature-frame dimensions.
print('Row x Cols of df:')
print(x.shape)

# Count missing entries per column and show only the columns that have any.
print('Sum of NA in each col:')
num_missing = x.isna().sum()
print(num_missing.loc[num_missing.gt(0)])

Row x Cols of df:
(70000, 15)
Sum of NA in each col:
vehicleType           6334
gearbox               3188
model                 3259
fuelType              5432
notRepairedDamage    12833
dtype: int64


In [10]:
# Boolean mask over the columns: True where the column has object dtype.
category = (x.dtypes == 'object')
categorical_cols = category[category].index.tolist()

print('categorical variables', categorical_cols, sep='\n')

categorical variables
['name', 'seller', 'vehicleType', 'gearbox', 'model', 'fuelType', 'brand', 'notRepairedDamage', 'dateCreated']


In [11]:
# Number of distinct values in each categorical column, listed in ascending order.
categorical_unique = [x[col].nunique() for col in categorical_cols]
d = dict(zip(categorical_cols, categorical_unique))

sorted(d.items(), key=lambda pair: pair[1])

[('seller', 1),
 ('gearbox', 2),
 ('notRepairedDamage', 2),
 ('fuelType', 7),
 ('vehicleType', 8),
 ('brand', 40),
 ('dateCreated', 78),
 ('model', 243),
 ('name', 50666)]

In [12]:
x.head()

Unnamed: 0,name,seller,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode
0,Opel_Astra_G_Cabrio,privat,cabrio,2001,,0,astra,150000,0,,opel,nein,2016-03-08,0,26789
1,Alfa_Romeo_166_2.0_Twin_Spark_EFH_Klimaautomat...,privat,suv,2017,automatik,50,omega,150000,3,benzin,mercedes_benz,nein,2016-03-26,0,35516
2,BMW_525d,privat,limousine,2002,automatik,163,5er,150000,2,diesel,bmw,nein,2016-04-05,0,56751
3,Volkswagen_Fox_1.2,privat,kleinwagen,2005,manuell,54,fox,150000,10,benzin,volkswagen,nein,2016-03-16,0,47198
4,Mercedes_Benz_220_CDI,privat,limousine,2005,manuell,150,andere,150000,2,diesel,mercedes_benz,nein,2016-03-11,0,94344


In [13]:
#Column plan (working notes; the code below is authoritative):
#seller: drop since one unique
#name: drop (50666 unique values)
#nrOfPictures: drop
#model: NOT dropped in this cell; it is removed from x_train/x_test after the split and target encoded later
#notRepairedDamage: noted as "drop", but it is not in bad_cols, so it stays in x
#gearbox, fuelType, vehicleType: impute and one hot (original plan)


#brand: ordinal encode?

#dropping seller, name, nrOfPictures
bad_cols = ['seller','name', 'nrOfPictures']
#bad_cols1 = ['seller','name', 'nrOfPictures']

x = x.drop(bad_cols, axis = 1)

#good_cols = ['gearbox','fuelType','vehicleType']

In [14]:
# Convert the ISO-formatted dateCreated strings into a datetime column.
x = x.assign(
    dateCreated_parsed=pd.to_datetime(x['dateCreated'], format='%Y-%m-%d')
)

In [15]:
x['dateCreated_parsed'].dtype

dtype('<M8[ns]')

In [16]:
# Replace the listing date with three integer columns (month, day, year),
# then discard both the raw string column and the parsed helper column.
x = (
    x.assign(
        date_month=x['dateCreated_parsed'].dt.month,
        date_day=x['dateCreated_parsed'].dt.day,
        date_year=x['dateCreated_parsed'].dt.year,
    )
    .drop(columns=['dateCreated', 'dateCreated_parsed'])
)

In [17]:
#x = x.drop(['brand'], axis = 1)

In [18]:
# Hold out 20% of the labelled rows for validation; the seed is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)

In [19]:
# Keep untouched copies (still containing 'model') for the target-encoding experiment.
x_train1, x_test1 = x_train.copy(), x_test.copy()

# The one-hot based experiments work without the 'model' column.
x_train = x_train.drop(columns='model')
x_test = x_test.drop(columns='model')

In [20]:
# Show the dimensions of the training features after dropping 'model'.
print("Shape of x_train: ", x_train.shape, sep="\n")

Shape of x_train: 
(56000, 13)


In [21]:
# Split the columns by kind. select_dtypes(include='number') accepts every
# numeric dtype, whereas an exact 'int64'/'float64' comparison silently omits
# narrower integer/float dtypes (e.g. int32) from all_cols and therefore from
# x_train.
numerical_cols = x_train.select_dtypes(include='number').columns.tolist()
categorical_cols = x_train.select_dtypes(include='object').columns.tolist()

all_cols = numerical_cols + categorical_cols

# Apply the same column selection and order to both splits so they cannot drift apart.
x_train = x_train[all_cols]
x_test = x_test[all_cols]

In [22]:
# Manual one-hot encoding: the encoder learns its categories from the training
# split only and is then applied unchanged to the validation split.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse = False)
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(x_train[categorical_cols]),
    index=x_train.index,
)
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(x_test[categorical_cols]),
    index=x_test.index,
)

# Remaining (non-categorical) columns of each split.
num_x_train = x_train.drop(columns=categorical_cols)
num_x_test = x_test.drop(columns=categorical_cols)

# Final design matrices: original non-categorical columns followed by the indicator columns.
OH_x_train = pd.concat([num_x_train, OH_cols_train], axis = 1)
OH_x_test = pd.concat([num_x_test, OH_cols_test], axis = 1)

In [23]:
#creating a preprocessor to impute missing vals and one hot encode for categorical
#preprocessor = Pipeline([
#    ('imputer', SimpleImputer(strategy='most_frequent')),
#    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

#preprocessor.fit(x_train)

#x_train_transformed = pd.DataFrame(preprocessor.transform(x_train))
#x_test_transformed = pd.DataFrame(preprocessor.transform(x_test))

In [24]:
#null_index= x_train.isnull().index
#null_index
#print(num_miss[num_miss>0])

In [25]:
####   TRANSFORMER   #####
#transform good categorical columns by imputing and one hot encoding 
#numerical_transformer = SimpleImputer()

#categorical_transformer = Pipeline(steps=[
#    ('imputer', SimpleImputer(strategy='most_frequent')),
#    ('onehot', OneHotEncoder(handle_unknown='ignore'))
#])

#preprocessor does not need numerical since it does not have any NAs
#preprocessor = ColumnTransformer( 
#    transformers = [
#        ('num', numerical_transformer, numerical_cols),
#        ('cat', categorical_transformer, categorical_cols)
#])

#model
#model = XGBRegressor(n_estimators=500)
#model = RandomForestRegressor(n_estimators=100, random_state=0)

#pipeline
#pipeline = Pipeline(steps = [('preprocessor', preprocessor),
#                            ('model', model)
#                            ])

#fitting and predicting
#pipeline.fit(x_train, y_train,
            # model__early_stopping_rounds=5,
            # model__eval_set=[(x_test,y_test)],
            # model__verbose=False)
#pipeline.fit(x_train,y_train)

#predictions = pipeline.predict(x_test)

In [26]:
#fit an XGBoost regressor and report its MSE on the supplied evaluation split
def xgboost_mse(x_train, x_test, y_train, y_test):
    """Train an ``XGBRegressor`` (500 estimators) and return its MSE on ``x_test``.

    Parameters
    ----------
    x_train, y_train : features and target used for fitting.
    x_test, y_test : features and target used both as the early-stopping
        evaluation set and for the reported error.

    Returns
    -------
    The value of ``mean_squared_error(y_test, predictions)``.

    Notes
    -----
    The same ``(x_test, y_test)`` pair drives early stopping (5 rounds) and
    the final score, so the returned error is measured on data the stopping
    rule has already seen.
    """
    fit_options = dict(
        early_stopping_rounds=5,
        eval_set=[(x_test, y_test)],
        verbose=False,
    )
    regressor = XGBRegressor(n_estimators=500)
    regressor.fit(x_train, y_train, **fit_options)
    return mean_squared_error(y_test, regressor.predict(x_test))

In [27]:
#fit a random forest and report its MSE on the supplied evaluation split
def RandomForest_mse(x_train, x_test, y_train, y_test):
    """Train a ``RandomForestRegressor`` (``random_state=0``) and return its MSE on ``x_test``.

    Parameters
    ----------
    x_train, y_train : features and target used for fitting.
    x_test, y_test : features and target used for scoring.

    Returns
    -------
    The value of ``mean_squared_error(y_test, predictions)``.
    """
    forest = RandomForestRegressor(random_state=0)
    forest.fit(x_train, y_train)
    return mean_squared_error(y_test, forest.predict(x_test))

In [28]:
#error_RF = RandomForest_mse(OH_x_train, OH_x_test, y_train, y_test)
#print(error_RF)

In [29]:
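#error term after OH encoding the remaining categorical variables (vehicleType, gearbox, fuelType, brand, notRepairedDamage)
#dropped columns: name, seller, nrOfPictures, model (brand is still present; its drop is commented out above)
#possibly ordinal encode brand and model 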
#error_OH_encode = xgboost_mse(OH_x_train, OH_x_test, y_train, y_test)
#print(error_OH_encode)

In [30]:
### Potential Improvements to the Model ###
# 1. K means Cluster
# 2. PCA
# 3. Target Encode the 'model' variable 


In [31]:
###---  PCA  ---###
#print(numerical_cols)
# Work on copies so the one-hot encoded frames stay untouched.
pca_x_train, pca_x_test = OH_x_train.copy(), OH_x_test.copy()
pca_numerical_cols = [
    'yearOfRegistration',
    'powerPS',
    'kilometer',
    'monthOfRegistration',
    'postalCode',
]

# Correlation of each selected column with the training target.
print("Correlation with price:\n")
print(pca_x_train.loc[:, pca_numerical_cols].corrwith(y_train))

Correlation with price:

yearOfRegistration     0.038544
powerPS                0.135953
kilometer             -0.284895
monthOfRegistration    0.077297
postalCode             0.056962
dtype: float64


In [32]:
#defining function to apply pca 
def apply_pca(x, fit_on=None):
    """Standardise ``x`` column-wise and return its principal-component scores.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric columns to project.
    fit_on : pandas.DataFrame, optional
        Frame whose column means, standard deviations and principal axes are
        used for the projection. Pass the training features here when
        transforming a validation/test frame so that both share the same
        scaling and component basis. Defaults to ``None``, in which case
        ``x`` itself is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``PC1 .. PCk`` on a fresh default integer index; the index of
        ``x`` is not carried over.
    """
    reference = x if fit_on is None else fit_on
    center = reference.mean(axis=0)
    scale = reference.std(axis=0)

    pca = PCA()
    if fit_on is None:
        # Same call as before: fit and project in one step.
        scores = pca.fit_transform((x - center) / scale)
    else:
        # Learn the axes from the reference frame only, then project x onto them.
        pca.fit((reference - center) / scale)
        scores = pca.transform((x - center) / scale)

    #to df
    names = [f"PC{i+1}" for i in range(scores.shape[1])]
    return pd.DataFrame(scores, columns=names)

In [33]:
pca_features_train = pca_x_train.loc[:, pca_numerical_cols]
pca_features_test = pca_x_test.loc[:, pca_numerical_cols]

# Standardise with statistics from the training split only, and learn the
# principal axes from the training split only. Fitting a second PCA on the
# validation split would yield components in a different basis, so a column
# such as "PC1" would not mean the same thing in the two frames.
pca_train_mean = pca_features_train.mean(axis=0)
pca_train_std = pca_features_train.std(axis=0)

pca_model = PCA()
pca_names = [f"PC{i+1}" for i in range(len(pca_numerical_cols))]

pca_scores_x_train = pd.DataFrame(
    pca_model.fit_transform((pca_features_train - pca_train_mean) / pca_train_std),
    columns=pca_names,
)
pca_scores_x_test = pd.DataFrame(
    pca_model.transform((pca_features_test - pca_train_mean) / pca_train_std),
    columns=pca_names,
)

In [34]:
# The score frames carry a default positional index; give them the row labels
# of the frames they were computed from so the join lines up row by row.
pca_scores_x_train.index = pd.Index(pca_x_train.index, name='index')
pca_scores_x_test.index = pd.Index(pca_x_test.index, name='index')

# Append the PC columns to the one-hot encoded features.
pca_x_train = pca_x_train.join(pca_scores_x_train)
pca_x_test = pca_x_test.join(pca_scores_x_test)

In [35]:
#pca_x_train.isna().sum()

In [36]:
#error_pca = xgboost_mse(pca_x_train, pca_x_test, y_train, y_test)
#error_pca

In [37]:
###--- K-Means ----###
# Columns that the clustering is computed on.
kmeans_features = [
    'yearOfRegistration',
    'powerPS',
    'kilometer',
    'monthOfRegistration',
    'postalCode',
]

# Independent copies restricted to the clustering columns.
kmeans_x_train = OH_x_train.loc[:, kmeans_features].copy()
kmeans_x_test = OH_x_test.loc[:, kmeans_features].copy()

#normalize
def normalize(x, reference=None):
    """Standardise each column of ``x`` to zero mean and unit sample standard deviation.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to scale.
    reference : pandas.DataFrame, optional
        Frame whose column means and standard deviations are used. Pass the
        training features here when scaling a validation/test frame so both
        splits share one scale. Defaults to ``None``, in which case the
        statistics of ``x`` itself are used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        ``(x - mean) / std`` computed column-wise; ``x`` is not modified.
    """
    stats_source = x if reference is None else reference
    return (x - stats_source.mean(axis=0)) / stats_source.std(axis=0)

In [38]:
## seeing if normalizing data provides better results 
# Both splits are scaled with the mean/std of the TRAINING split. Scaling the
# validation split with its own statistics would map identical raw values to
# different scaled values in the two frames.
norm_train_mean = kmeans_x_train.mean(axis=0)
norm_train_std = kmeans_x_train.std(axis=0)

normalized_x_train = normalize(kmeans_x_train)
normalized_x_test = (kmeans_x_test - norm_train_mean) / norm_train_std

norm_cols = normalized_x_train.columns

# Swap the raw numeric columns for their scaled versions.
norm_x_train = OH_x_train.drop(norm_cols, axis = 1)
norm_x_test = OH_x_test.drop(norm_cols, axis = 1)


norm_x_train = norm_x_train.join(normalized_x_train)
norm_x_test = norm_x_test.join(normalized_x_test)

#error_num_norm = xgboost_mse(norm_x_train, norm_x_test, y_train, y_test)
#error_num_norm

In [39]:
##K-Means
# Capture the training statistics BEFORE kmeans_x_train is overwritten, and
# scale the validation split with them so both splits live on the same scale.
kmeans_train_mean = kmeans_x_train.mean(axis=0)
kmeans_train_std = kmeans_x_train.std(axis=0)

kmeans_x_test = (kmeans_x_test - kmeans_train_mean) / kmeans_train_std
kmeans_x_train = normalize(kmeans_x_train)

In [40]:
kmeans = KMeans(n_clusters=8, n_init=8, random_state=0)

# Learn the centroids from the training split only. Selecting kmeans_features
# explicitly keeps a previously added 'Cluster' column out of the inputs if the
# cell is re-run.
kmeans_x_train['Cluster'] = kmeans.fit_predict(kmeans_x_train[kmeans_features])

# Assign validation rows to the centroids learned above. Calling fit_predict
# here would refit the model on the validation split, so its cluster ids would
# not correspond to the training cluster ids.
kmeans_x_test['Cluster'] = kmeans.predict(kmeans_x_test[kmeans_features])

In [41]:
kmeans_x_train = OH_x_train.join(kmeans_x_train['Cluster'])
kmeans_x_test = OH_x_test.join(kmeans_x_test['Cluster'])

In [42]:
#error_kmeans = xgboost_mse(kmeans_x_train, kmeans_x_test, y_train, y_test)
#error_kmeans

In [43]:
#model = XGBRegressor(n_estimators=500)
#mse_scores = cross_val_score(model, OH_x_train, y_train, cv=5, scoring='neg_mean_squared_error')

In [44]:
### ----- Target Encoding ----- ###
# Reserve 20% of the training rows purely for fitting the target encoder; the
# remaining 80% (with matching targets) become the model's training data.
encode_x_train = x_train1.sample(frac = 0.20, random_state=0)
encode_y = y_train.loc[encode_x_train.index]

pretrain_x_train = x_train1.drop(index=encode_x_train.index)
target_y_train = y_train.loc[pretrain_x_train.index]

# Fit the M-estimate encoder for 'model' on the reserved rows only.
encoder = MEstimateEncoder(cols=['model'], m=3.0)
encoder.fit(encode_x_train, encode_y)

# Apply the fitted encoder to the model-training rows and the validation rows.
target_x_train = encoder.transform(pretrain_x_train)
target_x_test = encoder.transform(x_test1)

# Column bookkeeping, based on the dtypes after encoding.
numerical_cols = [
    col for col, col_dtype in target_x_train.dtypes.items()
    if col_dtype in ['int64','float64']
]
categorical_cols = [
    col for col, col_dtype in target_x_train.dtypes.items()
    if col_dtype == 'object'
]

all_cols = numerical_cols + categorical_cols

# Manual one-hot encoding of the remaining object columns: categories are
# learned from the training rows and reused for the validation rows.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse = False)
tar_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(target_x_train[categorical_cols]),
    index=target_x_train.index,
)
tar_cols_test = pd.DataFrame(
    OH_encoder.transform(target_x_test[categorical_cols]),
    index=target_x_test.index,
)

num_tar_x_train = target_x_train.drop(columns=categorical_cols)
num_tar_x_test = target_x_test.drop(columns=categorical_cols)

# Final design matrices: non-categorical columns followed by the indicator columns.
tarOH_x_train = pd.concat([num_tar_x_train, tar_cols_train], axis = 1)
tarOH_x_test = pd.concat([num_tar_x_test, tar_cols_test], axis = 1)

In [45]:
#target_error = xgboost_mse(tarOH_x_train, tarOH_x_test, target_y_train, y_test)

In [46]:
#target_error

In [47]:
def avg_cross_val(model, x, y):
    """Return the mean 3-fold cross-validated MSE of ``model`` on ``(x, y)``.

    ``cross_val_score`` is called with ``scoring='neg_mean_squared_error'``,
    so its per-fold scores are negated before averaging.
    """
    neg_mse_per_fold = cross_val_score(
        model, x, y, cv=3, scoring='neg_mean_squared_error'
    )
    return (-neg_mse_per_fold).mean()

In [48]:
def avg_crossval(model, x_train, x_test, y_train, y_test):
    """Return the mean 3-fold cross-validated MSE over the recombined splits.

    The train and test features (and likewise the targets) are stacked
    row-wise into one data set, which is then scored by ``cross_val_score``
    with ``scoring='neg_mean_squared_error'``; the per-fold scores are negated
    before averaging.
    """
    features = pd.concat([x_train, x_test], axis = 0)
    target = pd.concat([y_train, y_test], axis = 0)
    mse_per_fold = -cross_val_score(
        model, features, target, cv=3, scoring='neg_mean_squared_error'
    )
    return mse_per_fold.mean()

In [49]:
# Estimator shared by all cross-validation comparisons below.
model = XGBRegressor(n_estimators=500)

# Only the headings are printed; the scoring calls are disabled.
print('Avg. Cross Validation MSE Scores:\n')

print('One Hot Encoding Only:')
#print(avg_crossval(model,OH_x_train, OH_x_test, y_train, y_test))

print('\nOH & PCA:')
#print(avg_crossval(model, pca_x_train, pca_x_test, y_train, y_test))

print('\nOH & K-Means:')
#print(avg_crossval(model, kmeans_x_train, kmeans_x_test, y_train, y_test))

print('\nOH & Target Encoding:')
#print(avg_crossval(model, tarOH_x_train, tarOH_x_test, target_y_train, y_test))

Avg. Cross Validation MSE Scores:

One Hot Encoding Only:

OH & PCA:

OH & K-Means:

OH & Target Encoding:


In [50]:
#### FINAL PREDICTION OUTPUT ####
# Build the submission features from 'test', dropping the same columns that
# were removed from the training features (bad_cols).
x_real = test.drop(columns=bad_cols)

In [51]:
# The submission file stores dates as month/day/two-digit-year strings.
# Leading zeros are stripped from each string before it is parsed.
x_real['dateCreated'] = pd.to_datetime(
    x_real['dateCreated'].map(lambda raw_date: raw_date.lstrip('0')),
    format='%m/%d/%y',
)

# Expand the parsed date into integer month/day/year columns and drop the original.
x_real = (
    x_real.assign(
        date_month=x_real['dateCreated'].dt.month,
        date_day=x_real['dateCreated'].dt.day,
        date_year=x_real['dateCreated'].dt.year,
    )
    .drop(columns=['dateCreated'])
)

In [52]:
x_real.head()

Unnamed: 0,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,postalCode,date_month,date_day,date_year
0,bus,2004,manuell,128,andere,150000,3,diesel,peugeot,nein,85764,4,5,2016
1,coupe,2001,automatik,163,c_klasse,150000,5,benzin,mercedes_benz,,21149,3,8,2016
2,limousine,2009,automatik,160,golf,30000,2,benzin,volkswagen,nein,58089,3,21,2016
3,kleinwagen,2003,manuell,64,polo,150000,5,benzin,volkswagen,nein,49685,3,25,2016
4,suv,2004,manuell,113,santa,150000,6,diesel,hyundai,nein,54472,3,13,2016


In [68]:
### ---- Target Encoding 'model' ---- ###
#***************************************#

#---PRODUCED BEST RESULTS---#

# Apply the target encoder that was fitted on the reserved training rows.
target_x_real = encoder.transform(x_real)


#separating cols 
numerical_cols = [col for col in target_x_real.columns if target_x_real[col].dtype in ['int64','float64']]
categorical_cols = [col for col in target_x_real.columns if target_x_real[col].dtype == 'object']

all_cols = numerical_cols + categorical_cols

# Manual one-hot encoding with the encoder ALREADY FITTED on the training rows
# (OH_encoder from the target-encoding cell). Fitting a fresh encoder on the
# submission data would derive the indicator columns from the categories that
# happen to occur there, so their number and order could differ from the
# columns the model was trained on.
tar_cols_real = pd.DataFrame(OH_encoder.transform(target_x_real[categorical_cols]))

tar_cols_real.index = target_x_real.index

num_tar_x_real = target_x_real.drop(categorical_cols, axis = 1)

tarOH_x_real = pd.concat([num_tar_x_real, tar_cols_real], axis = 1)

# Fail loudly if the submission matrix does not line up with the training matrix.
assert list(tarOH_x_real.columns) == list(tarOH_x_train.columns), \
    "submission features do not match the training feature layout"

In [69]:
# Fit XGBoost (500 estimators) on the target-encoded training rows, using the
# validation split for early stopping, then predict the submission rows.
model = XGBRegressor(n_estimators=500)
model.fit(
    tarOH_x_train,
    target_y_train,
    eval_set=[(tarOH_x_test, y_test)],
    early_stopping_rounds=5,
    verbose=False,
)

predictions = model.predict(tarOH_x_real)



In [71]:
#defining numerical and categorical cols 
#numerical_cols = [col for col in x_train.columns if x_train[col].dtype in ['int64','float64']]
#categorical_cols = [col for col in x_train.columns if x_train[col].dtype == 'object']

#manual OneHotEncoding
#OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse = False)
#OH_cols_real = pd.DataFrame(OH_encoder.fit_transform(x_real[categorical_cols]))

#OH_cols_real.index = x_real.index

#num_x_real = x_real.drop(categorical_cols, axis = 1)

#OH_x_real = pd.concat([num_x_real, OH_cols_real], axis = 1)

#XGBoost model 
#model = XGBRegressor(n_estimators=500)
#model.fit(OH_x_train, y_train,
#          early_stopping_rounds=5,
#          eval_set=[(OH_x_test,y_test)],
#          verbose=False)

#predictions = model.predict(OH_x_real)

In [56]:
def toCSV(pred, file_name):
    """Write predictions to a CSV file with columns ``Id`` and ``Predicted``.

    Parameters
    ----------
    pred : sequence or 1-D array
        Predicted values, one per row.
    file_name : str or path-like
        Destination passed to ``DataFrame.to_csv``.

    Notes
    -----
    Ids run from 1 to ``len(pred)``. Deriving the count from ``pred`` (rather
    than assuming exactly 10000 rows) yields the same file for 10000
    predictions and also works for any other length.
    """
    ids = np.arange(1, len(pred) + 1)
    output = pd.DataFrame({'Id' : ids, 'Predicted' : pred})
    output.to_csv(file_name, index = False)

In [57]:
#toCSV(predictions,'submission3.csv')

In [58]:
### XGBOOST Hyperparameter Tuning ###
#from sklearn.model_selection import GridSearchCV

#XGBoost model 
#model = XGBRegressor()

#parameters
#parameters = {
#    'learning_rate' : [0.01,0.05],
#    'n_estimators' : [250,500,1000],
#    'max_depth' : [6,8]
#    #'subsample' : [0.9,0.7]
#}

#grid = GridSearchCV(estimator=model, param_grid=parameters, scoring='neg_mean_squared_error', verbose=1)

#grid.fit(tarOH_x_train, target_y_train)


#print("Best parameters:", grid.best_params_)

#*************************************************************#
# --------- Output -----------#
#Fitting 5 folds for each of 12 candidates, totalling 60 fits
#Best parameters: {'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 500}

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 500}


In [72]:
# XGBoost with the hyperparameters reported by the grid search.
tuned_params = dict(learning_rate=0.05, max_depth=8, n_estimators=500)
model = XGBRegressor(**tuned_params)

# The validation split drives early stopping (5 rounds without improvement).
model.fit(
    tarOH_x_train,
    target_y_train,
    eval_set=[(tarOH_x_test, y_test)],
    early_stopping_rounds=5,
    verbose=False,
)
#predictions = model.predict(OH_x_real)



XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=500, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [73]:
# Predict prices for the encoded submission rows with the current `model`
# (presumably the tuned XGBRegressor fitted in the preceding cell; verify the
# execution order before exporting).
predictions = model.predict(tarOH_x_real)
#toCSV(predictions,'submission4.csv')

In [59]:
###--- K-Means ----###
#kmeans_x_real = OH_x_real.copy()
#kmeans_features = ['yearOfRegistration', 'powerPS', 'kilometer', 'monthOfRegistration', 'postalCode']

#kmeans_x_real = kmeans_x_real.loc[:,kmeans_features] 

##K-Means
#kmeans_x_real = normalize(kmeans_x_real)



In [60]:
#kmeans = KMeans(n_clusters=8, n_init=8, random_state=0)
#kmeans_x_real['Cluster'] = kmeans.fit_predict(kmeans_x_real)

#kmeans_x_real = OH_x_real.join(kmeans_x_real['Cluster'])