# Hierarchical Agglomerative Clustering

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler   
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
%matplotlib inline

In [2]:
# Load the raw CSV into a DataFrame; every later cell works on `data`.
data = pd.read_csv("../input/frauddet/v1_1.csv")

In [3]:
# Preview the first five rows of the raw data.
data.head(n=5)

In [4]:
# True if at least one cell anywhere in the frame is missing.
data.isna().to_numpy().any()

In [5]:
# Number of missing values per column, largest count first (descending order).
data.isna().sum().sort_values(ascending=False)

## Data Cleaning

In [6]:
# Fraction (0 to 1) of rows that are missing in each column, largest first.
missing_counts = data.isnull().sum()
perc_missing_val = (missing_counts / len(data)).sort_values(ascending=False)
perc_missing_val

In [7]:
# Drop columns that are not used in the analysis below.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises a KeyError.
columns_to_drop = ['Film Title', 'superstar hero', 'OTT availability']
data = data.drop(columns=columns_to_drop, errors='ignore')

In [8]:
# NOTE(review): disabled alternative to the label-encoding cell that follows:
# explicit value -> code mappings for three of the categorical columns.
# The results are not assigned back, so even if uncommented these lines would
# not modify `data`. The third mapping lists ' Theatre' and 'Theatre '
# separately, which suggests the raw column contains stray whitespace;
# confirm against the data.
# data['Superhit, Hit or Flop'].replace(['Superhit','Hit', 'Flop'],[2,1,0])
# data['Holiday Week Release'].replace(['no','yes'],[0,1])
# data['OTT/Theatre Release'].replace(['OTT',' Theatre','Theatre '],[0,1,1])


In [9]:
# Encode the categorical columns as integer codes.
to_encode = ['Superhit, Hit or Flop', 'Genre', 'Holiday Week Release', 'OTT/Theatre Release']

# Trim stray whitespace first so that variants such as ' Theatre' and
# 'Theatre ' collapse into one category instead of receiving different codes.
# Non-string cells (NaN, or ints when this cell is re-run) pass through untouched.
for column in to_encode:
    data[column] = data[column].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

# Fit one encoder per column and keep it, so each column's code -> label
# mapping stays available through `encoders[column].classes_`.
encoders = {column: LabelEncoder() for column in to_encode}
for column in to_encode:
    data[column] = encoders[column].fit_transform(data[column])

# Keep the original `encoder` name bound; as before, it ends up fitted on the
# last column listed in `to_encode`.
encoder = encoders[to_encode[-1]]

In [10]:
# Inspect the first five rows after encoding the categorical columns.
data.head(n=5)

In [11]:
# Print each column's dtype and non-null count.
data.info()

Convert the two monetary columns from strings to floats; entries that cannot be parsed as numbers become NaN.

In [12]:
# Parse the box-office figures as numbers; unparseable entries become NaN.
box_office_col = 'Box Office Collection (in Crores)'
data[box_office_col] = pd.to_numeric(data[box_office_col], errors='coerce')

In [13]:
# Parse the profit/loss figures as numbers; unparseable entries become NaN.
profit_loss_col = 'Profit/Loss(INR) (in Crores)'
data[profit_loss_col] = pd.to_numeric(data[profit_loss_col], errors='coerce')

In [14]:
# Re-check for missing values: coercing to numeric can introduce new NaNs.
data.isna().to_numpy().any()

In [15]:
# Missing-value count per column after the numeric conversion, largest first.
data.isna().sum().sort_values(ascending=False)

In [16]:
# Fill the missing values in the numeric columns with each column's median.
#
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the selected column. That chained form operates
# on an intermediate Series: it emits a FutureWarning in pandas 2.x and leaves
# `data` unchanged once Copy-on-Write is active (the default in pandas 3).
median_fill_columns = [
    'Profit/Loss(INR) (in Crores)',
    'Box Office Collection (in Crores)',
    'Profit/Loss Margin',
    'Budget ( in Cr)',
]
for column in median_fill_columns:
    data[column] = data[column].fillna(data[column].median())

In [17]:
# Confirm that the median fill left no missing values in any column.
data.isna().sum().sort_values(ascending=False)

In [18]:
# Inspect the first five rows of the cleaned data.
data.head(n=5)

### To plot the clusters on a 2-D graph later, we restrict the clustering to two variables: IMDB rating and OTT/Theatre Release.

In [19]:
# Feature matrix for clustering: one row per film, two columns.
# The columns are selected by name rather than by position ([8, 9]) so the
# selection does not silently change if the column order or the set of dropped
# columns changes.
# NOTE(review): assumes positions 8 and 9 were these two columns, as the
# surrounding text and the plot axis labels state; confirm against data.columns.
feature_columns = ['IMDB rating ', 'OTT/Theatre Release']
X = data[feature_columns].to_numpy()
# X

In [20]:
# Each of the columns at positions 8 and 9 as its own single-column 2-D array.
a, b = (data.iloc[:, [position]].values for position in (8, 9))

In [21]:
# Scatter of the two clustering features before any cluster assignment.
sns.scatterplot(x='IMDB rating ', y='OTT/Theatre Release', data=data)


In [22]:
# Build the Ward-linkage hierarchy over X (IMDB rating vs. release type) and
# draw it as a dendrogram to help choose the number of clusters.
ward_linkage = sch.linkage(X, method='ward')
dendrogram = sch.dendrogram(ward_linkage)

Looking at the dendrogram, the longest vertical line that is not crossed by any horizontal merge line is the middle orange one. A horizontal threshold drawn through that line crosses 4 vertical lines, so the optimal number of clusters is 4.

We create an instance of AgglomerativeClustering using the euclidean distance as the measure of distance between points and ward linkage to calculate the proximity of clusters.

In [23]:
# Fit agglomerative clustering with 4 clusters (the number read off the dendrogram).
#
# No distance argument is passed: Ward linkage only supports Euclidean
# distance, which is also the default. The former `affinity` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to `metric`), so
# omitting it keeps this cell working on both older and current versions.
model = AgglomerativeClustering(n_clusters=4, linkage='ward')
labels = model.fit(X).labels_

In [24]:
# Cluster index (0-3) assigned to each row of X.
labels

In [27]:
# Draw one scatter layer per cluster, each in its own colour.
cluster_colours = ['red', 'blue', 'green', 'purple']
for cluster_id, colour in enumerate(cluster_colours):
    in_cluster = labels == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s=50, marker='o', color=colour)
plt.xlabel("IMDB rating")
plt.ylabel("OTT/Theatre Release")

plt.show()
