In [None]:
# Name: Omkar Hulawale
# Roll no: 14153
# Batch: A3

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Load dataset: read the sales CSV via a relative path, decoding the file as Latin-1
df = pd.read_csv("sales_data_sample.csv", encoding="Latin-1")

# Display initial rows as a quick sanity check that the file parsed as expected
df.head()

In [None]:
# Shape of dataset as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max);
# with default arguments describe() reports numeric columns only when any are present
df.describe()

In [None]:
# Information about dataset: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Count the missing entries in every column
df.isna().sum()

In [None]:
# Remove the address and order-status columns, which are not used further on
df_drop = [
    'ADDRESSLINE1',
    'ADDRESSLINE2',
    'STATUS',
    'POSTALCODE',
    'CITY',
]
df = df.drop(columns=df_drop)

# Re-count missing values per column now that those columns are gone
df.isna().sum()

In [None]:
# Check datatypes of the remaining columns to see which ones are not yet numeric
df.dtypes

In [None]:
# Print the distinct values of each categorical column of interest, one array per column
for column in ('COUNTRY', 'PRODUCTLINE', 'DEALSIZE'):
    print(df[column].unique())

In [None]:
# One-hot encoding for the categorical variables that are kept as features
productline = pd.get_dummies(df['PRODUCTLINE'])
Dealsize = pd.get_dummies(df['DEALSIZE'])

df = pd.concat([df, productline, Dealsize], axis=1)

# Drop original categorical columns: PRODUCTLINE and DEALSIZE are now represented
# by their dummy columns, while COUNTRY is discarded without being encoded
df_drop = ['COUNTRY', 'PRODUCTLINE', 'DEALSIZE']
df = df.drop(columns=df_drop)

# Encode PRODUCTCODE as integer category codes
df['PRODUCTCODE'] = pd.Categorical(df['PRODUCTCODE']).codes

# Drop ORDERDATE column (reassignment rather than inplace=True, consistent with
# every other drop in this notebook)
df = df.drop(columns='ORDERDATE')

# The clustering cells pass the whole frame to KMeans, which raises a ValueError
# on any string/object column. Report and remove whatever non-numeric columns
# are still left so the frame is model-ready; this is a no-op when every
# remaining column is already numeric or boolean.
non_numeric = df.select_dtypes(exclude=['number', 'bool']).columns.tolist()
if non_numeric:
    print('Dropping non-numeric columns:', non_numeric)
    df = df.drop(columns=non_numeric)

df.dtypes

In [None]:
# Elbow method: fit one KMeans model per candidate k (1 through 9) and record
# each fitted model's inertia_
K = range(1, 10)
distortions = []
for k in K:
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(df)
    distortions.append(kmeanModel.inertia_)

# Plot inertia against k; the bend ("elbow") in the curve suggests a suitable k
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('K')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Train KMeans model with 3 clusters
# Single source of truth for the cluster count: it drives the model, the size of
# the count vector and the column labels below.
N_CLUSTERS = 3

# Build an explicit float matrix: a frame mixing boolean dummy columns with
# numeric columns would otherwise come back from .values as an object array.
x_train = df.to_numpy(dtype=float)
print(x_train.shape)

model = KMeans(n_clusters=N_CLUSTERS, random_state=2)
model = model.fit(x_train)
predictions = model.predict(x_train)

# Cluster counts: bincount with minlength always yields one entry per cluster
# label (0 for a label with no members), so the table keeps N_CLUSTERS columns
# even if fewer distinct labels are predicted.
counts = np.bincount(predictions, minlength=N_CLUSTERS)
counts_df = pd.DataFrame(
    [counts],
    columns=[f'Cluster{i + 1}' for i in range(N_CLUSTERS)],
)
counts_df.head()

In [None]:
# Reduce the training matrix to two principal components and wrap the result
# in a labelled DataFrame
pca = PCA(n_components=2)
components = pca.fit_transform(x_train)
reduced_X = pd.DataFrame(components, columns=['PCA1', 'PCA2'])
reduced_X.head()