In [None]:
import numpy as np
import pandas as pd
from sklearn.utils import Bunch

# Load the Boston housing data into `data`, an object exposing `.data`,
# `.target` and `.feature_names`.
try:
    # Bundled loader: deprecated in scikit-learn 1.0 and removed in 1.2,
    # where this import raises ImportError.
    from sklearn.datasets import load_boston
except ImportError:
    # Fallback for scikit-learn >= 1.2: rebuild the same arrays from the
    # original StatLib file. Each record is split over two physical lines
    # (11 values, then 3), hence the even/odd row interleaving below.
    _boston_raw = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    data = Bunch(
        data=np.hstack([_boston_raw.values[::2, :], _boston_raw.values[1::2, :2]]),
        target=_boston_raw.values[1::2, 2],
        feature_names=np.array(
            ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
             "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]
        ),
    )
else:
    data = load_boston()

data

In [None]:
import pandas as pd

# Tabular view of the feature matrix: one column per entry in
# data.feature_names.
df = pd.DataFrame(
    data=data.data,
    columns=data.feature_names,
)
df

In [None]:
# Display the pairwise correlation matrix of df's columns, computed with
# DataFrame.corr's default settings.
df.corr()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Render df's correlation matrix as an annotated heatmap on a 12x10 figure.
heat_fig, heat_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    df.corr(),
    annot=True,
    cmap=plt.cm.CMRmap_r,
    ax=heat_ax,
)
plt.show()

In [None]:
def correlation(dataset, threshold):
    """Return names of columns strongly correlated with an earlier column.

    Args:
        dataset: DataFrame whose ``.corr()`` matrix is inspected.
        threshold: Cut-off applied to the absolute correlation coefficient;
            only values strictly greater than it count as correlated.

    Returns:
        set: For every pair of columns whose absolute correlation exceeds
        ``threshold``, the name of the column that appears later in
        ``dataset`` (the earlier one of the pair is not added for that pair).
    """
    abs_corr = dataset.corr().abs()
    names = abs_corr.columns
    # Row `pos`, columns [0, pos) is the strictly-lower-triangular slice, so
    # each unordered pair is examined once and the diagonal is skipped.
    return {
        names[pos]
        for pos in range(len(names))
        if (abs_corr.iloc[pos, :pos] > threshold).any()
    }

In [None]:
# Collect the columns whose absolute correlation with an earlier column
# is above 0.7, then display the resulting set.
corr_features = correlation(dataset=df, threshold=0.7)
corr_features

In [None]:
# Preview df without the flagged columns; the result is only displayed,
# df itself is not reassigned.
df.drop(columns=corr_features)

In [None]:
# Ask for a file upload when running inside Google Colab. Outside Colab the
# `google.colab` package is not installed, so skip the prompt instead of
# crashing; 'iris-kmeans.csv' is then expected to already be in the working
# directory.
try:
    from google.colab import files
except ImportError:
    uploaded = {}  # same mapping type files.upload() returns, just empty
else:
    uploaded = files.upload()

In [None]:
# Read the iris CSV into df (replacing the previous frame bound to that name)
# and show the first five rows.
df = pd.read_csv(filepath_or_buffer='iris-kmeans.csv')
df.head(n=5)

In [None]:
# Print df's column names, non-null counts and dtypes.
df.info()

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit KMeans for k = 1..9 on the four measurement columns and
# plot the inertia (within-cluster sum of squares) against k.
elbow_features = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]

distortions = []
K = range(1, 10)
for k in K:
    # random_state makes the curve reproducible across runs; n_init=10 pins
    # the long-standing default so results do not shift with the
    # scikit-learn version (the default became 'auto' in 1.4).
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeanModel.fit(elbow_features)
    distortions.append(kmeanModel.inertia_)

plt.figure(figsize=(16, 8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Fit the final 3-cluster model on the four measurement columns.
features = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
# random_state fixes the k-means++ seeding so cluster ids are stable between
# runs; n_init=10 pins the long-standing default number of restarts.
model = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
model.fit(features)

In [None]:
# Cluster id assigned to each training row, converted from the fitted
# model's labels_ array to a plain Python list, then displayed.
xx = model.labels_.tolist()
xx

In [None]:
# Attach the cluster ids as a 'label' column and display the frame.
df = df.assign(label=xx)
df

In [None]:
# Petal length vs. petal width, side by side: left panel coloured by the
# 'Species-code' column, right panel coloured by the KMeans 'label' column.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

for ax, colour_col, title in zip(axes, ('Species-code', 'label'), ('Actual', 'KMean')):
    ax.scatter(df['PetalLengthCm'], df['PetalWidthCm'], c=df[colour_col])
    ax.set_title(title)
    # Label the axes so each panel can be read on its own.
    ax.set_xlabel('PetalLengthCm')
    ax.set_ylabel('PetalWidthCm')

# Explicit show() keeps the cell from echoing the last call's Text(...) repr.
plt.show()

In [None]:
# Sepal length vs. sepal width, side by side: left panel coloured by the
# 'Species-code' column, right panel coloured by the KMeans 'label' column.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

for ax, colour_col, title in zip(axes, ('Species-code', 'label'), ('Actual', 'KMean')):
    ax.scatter(df['SepalLengthCm'], df['SepalWidthCm'], c=df[colour_col])
    ax.set_title(title)
    # Label the axes so each panel can be read on its own.
    ax.set_xlabel('SepalLengthCm')
    ax.set_ylabel('SepalWidthCm')

# Explicit show() keeps the cell from echoing the last call's Text(...) repr.
plt.show()

In [None]:
# Predict the cluster of one hand-entered flower. The model was fitted on a
# DataFrame, so pass a one-row DataFrame with the same column names: this
# avoids scikit-learn's "X does not have valid feature names" warning and
# makes the meaning of each value explicit instead of relying on position.
sample = pd.DataFrame(
    [[3.8, 2.8, 1.6, 2.4]],
    columns=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'],
)
model.predict(sample)

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the 'Species-code' column against the KMeans 'label' column.
# NOTE(review): KMeans cluster ids are presumably arbitrary with respect to
# the species codes, so the diagonal may not represent "correct" rows.
# Confirm the id alignment before reading this as an accuracy table.
y_true, y_pred = df['Species-code'], df['label']

print(confusion_matrix(y_true=y_true, y_pred=y_pred))

In [None]:
# Lift pandas' row-display limit for the rest of the session, then show the
# whole frame.
pd.options.display.max_rows = None
df