Recreating an example to gain understanding for KMeans model

In [7]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.cluster import KMeans 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix, recall_score
%matplotlib inline

train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)

# print("***** Train_Set *****")
# print(train.head())
# print("\n")
# print("***** Test_Set *****")
# print(test.head())

# print("***** Train_Set *****")
# print(train.describe())
# print("\n")
# print("***** Test_Set *****")
# print(test.describe())

# print(train.columns.values)
train.isna().head()
test.isna().head()

# print("***** In the train set *****")
# print(train.isna().sum())
# print("\n")
# print("***** In the test set *****")
# print(test.isna().sum())

train.fillna(train.mean(), inplace=True)
test.fillna(test.mean(), inplace=True)

# print(train.isna().sum())
# print(test.isna().sum())

train['Ticket'].head()
train['Cabin'].head()
train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)
train[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)
train[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)

# g = sns.FacetGrid(train, col='Survived')
# g.map(plt.hist, 'Age', bins=20)

# grid = sns.FacetGrid(train, col='Survived', row='Pclass', size=2.2, aspect=1.6)
# grid.map(plt.hist, 'Age', alpha=.5, bins=20)
# grid.add_legend(); 

# train.info()

train = train.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)
test = test.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

labelEncoder = LabelEncoder()
labelEncoder.fit(train['Sex'])
labelEncoder.fit(test['Sex'])
train['Sex'] = labelEncoder.transform(train['Sex'])
test['Sex'] = labelEncoder.transform(test['Sex'])

# train.info()

# test.info()

X = np.array(train.drop(['Survived'], 1).astype(float))
y = np.array(train['Survived'])
# train.info()


# K-Means Model 

kmeans = KMeans(n_clusters=2, max_iter=600, algorithm='auto')
kmeans.fit(X)

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
kmeans.fit(X_scaled)

correct = 0
for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = kmeans.predict(predict_me)
    if prediction[0] == y[i]:
        correct += 1

print(correct/len(X))

0.37373737373737376


Predicting with KMeans model

In [8]:
# predicting with KMeans model with validation set

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import KMeans 

# reading data file:
data_dataframe = pd.read_csv('../TOCOM.csv', sep=';') 

# dropping NaN values: 
data_dataframe.dropna(subset = ["Gender"], inplace=True)
data_dataframe.dropna(subset = ["Leeftijd"], inplace=True)
data_dataframe.dropna(subset = ["AST"], inplace=True)
data_dataframe.dropna(subset = ["Length_cm"], inplace=True)
data_dataframe.dropna(subset = ["Weight"], inplace=True)
data_dataframe.dropna(subset = ["BMI"], inplace=True)
data_dataframe.dropna(subset = ["MQ"], inplace=True)
data_dataframe.dropna(subset = ["MQ_category"], inplace=True)

# dropping colums:
data_dataframe_modified = data_dataframe.drop(data_dataframe.columns[[0, 2, 3, 4, 6, 7, 9, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22,23,24,25,26,27,28,29,30,31,32,33,34, 35, 36]], axis=1)

# printing head of the dataframe: 
print(data_dataframe_modified.head())

# setting X and y: 
X = data_dataframe_modified.iloc[:, :-1].values
y = data_dataframe_modified["MQ_category"].values

# splitting dataframe in train, test and validation sets:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2 ,random_state=11111 , stratify = y) 
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=11111)

# KMeans model: 
kmeans_model = KMeans(n_clusters=3, max_iter=6)

# using KMeans model classifier in order to fit the data:
kmeans_model.fit(X_train, y_train)

# predicting y-data with classifier:
y_predict = kmeans_model.predict(X_test)

# getting the scores for the X_test and y_test subsets:
kmeans_model.score(X_test, y_test)

# printing the scores:
print('Score for training: {}'.format(kmeans_model.score(X_train, y_train)))
print('Score for testing: {}'.format(kmeans_model.score(X_test, y_test)))

# Printing the confusion matrix:
confusionMatrix = confusion_matrix(y_test, y_predict)
print(confusionMatrix)

   Gender  Leeftijd  MQ_category  Length_cm  Weight    BMI
0     0.0      4.55          3.0      111.5    19.8  15.93
5     0.0      5.86          4.0      114.5    19.9  15.18
6     0.0      5.83          4.0      115.1    22.7  17.13
8     1.0      4.43          3.0      115.9    22.3  16.60
9     0.0      5.67          2.0      109.7    15.9  13.21
Score for training: -23655.384736118136
Score for testing: -96327698.19398567
[[ 0  0  0  0  0  0]
 [16  0  9  0  0  0]
 [21  0 18  0  0  0]
 [77  1 67  0  0  0]
 [33  0 29  0  0  0]
 [26  0 11  0  0  0]]


In [9]:
# With scaling 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import KMeans 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# reading data file:
data_dataframe = pd.read_csv('../TOCOM.csv', sep=';') 

# dropping colums:
data_dataframe_modified = data_dataframe.drop(data_dataframe.columns[[0, 2, 3, 4, 6, 7, 9, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22,23,24,25,26,27,28,29,30,31,32,33,34, 35, 36]], axis=1)

# dropping NaN values: 
data_dataframe.dropna(subset = ["Gender"], inplace=True)
data_dataframe.dropna(subset = ["Leeftijd"], inplace=True)
data_dataframe.dropna(subset = ["AST"], inplace=True)
data_dataframe.dropna(subset = ["Length_cm"], inplace=True)
data_dataframe.dropna(subset = ["Weight"], inplace=True)
data_dataframe.dropna(subset = ["BMI"], inplace=True)
data_dataframe.dropna(subset = ["MQ"], inplace=True)
data_dataframe.dropna(subset = ["MQ_category"], inplace=True)

# calculating the median of the dataframe: 
data = data_dataframe_modified.fillna(data_dataframe_modified.median())

# printing head of the dataframe: 
print(data_dataframe_modified.head())

# setting a variable data:
N = 2
data = data.iloc[:, :-N]

# setting y variable: 
y = data["MQ_category"].values.astype(int)

# scaling the data:
scaler = MinMaxScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
scaled_data

# setting X variable: 
X = scaled_data.drop(scaled_data.columns[[2]], axis= 1)

# splitting dataframe in train, test and validation sets:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11111, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size= 0.25, random_state = 11111)

# KMeans model: 
kMeans_model = KMeans(n_clusters = 3, init = 'k-means++', n_init = 10, max_iter = 100)

# using KMeans model classifier in order to fit the data:
kMeans_model.fit(X_train, y_train)

# predicting y-data with classifier:
y_predict = kMeans_model.predict(X_test)

# getting the scores for the X_test and y_test subsets:
testing_score = kMeans_model.score(X_test, y_test)
print(f"Score of testing: {testing_score}")

# Printing the confusion matrix:
confusionMatrix = confusion_matrix(y_test, y_predict)
print(confusionMatrix)


   Gender  Leeftijd  MQ_category  Length_cm  Weight    BMI
0     0.0      4.55          3.0      111.5    19.8  15.93
1     1.0      5.01          NaN      112.5    17.3  13.67
2     0.0      5.28          NaN      110.9    21.0  17.07
3     0.0      5.79          NaN      105.7    18.2  16.29
4     1.0      4.73          NaN      116.0    22.4  16.65
Score of testing: -1.4390655593080706
[[ 0  0  0  0  0  0]
 [ 8 15  2  0  0  0]
 [11 18 11  0  0  0]
 [35 90 51  0  0  0]
 [17 20 26  0  0  0]
 [ 6 21 11  0  0  0]]
