## Importing necessary python libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

### Reading the data

In [None]:
# Load the food-rating CSV into a DataFrame.
ratings_csv = "food_rating.csv"
data = pd.read_csv(ratings_csv)

In [None]:
# Preview the first rows to confirm the file loaded and to see the column names.
data.head()

## We can see that all the variables are numerical and that `Emp Id` is just a unique identification number for each employee. Let's set it as the index.

In [None]:
# Use the employee identifier as the row index so it is not treated as a variable.
# Reassigning (instead of inplace=True) together with the column check keeps this
# cell safe to re-run: a second run would otherwise raise KeyError because
# 'Emp Id' has already been moved out of the columns.
if 'Emp Id' in data.columns:
    data = data.set_index('Emp Id')

### descriptive stats of the dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

## Visualize the spread of the variables using a boxplot

In [None]:
# One boxplot per column on a shared axis, so the ranges can be compared side by side.
fig, ax = plt.subplots(figsize=(12, 8))
data.boxplot(ax=ax, grid=True)
fig.suptitle('BOXPLOTS TO CHECK THE SPREAD OF VARIABLES', size=20)

## Question-1: Do you find any difference in the scale of the variables? Is there an exception? Write your observations from the boxplots.

## Let's check the correlation matrix to understand the relationships between the variables

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `data`.
# The former `ax.set_ylim(bottom + 0.5, top - 0.5)` adjustment was a workaround for
# matplotlib 3.1.1 clipping the first and last heatmap rows; on other releases it
# only adds blank half-rows above and below the matrix, so it has been removed.
plt.figure(figsize=(10, 8))
ax = sns.heatmap(data.corr(), annot=True)

## Question-2: What patterns do you observe here? What insights can you draw from the exploratory data analysis so far?

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column to the [0, 1] range and store the result in a NEW frame.
# `data[:]` is only a slice of `data`, not an independent copy: assigning into it can
# raise SettingWithCopyWarning and, depending on the pandas version, overwrite the
# values in `data` itself. Building a fresh DataFrame leaves `data` untouched.
scaler = MinMaxScaler()
data_copy = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,
)

## Question-3: What did we do here? Why did we do it? 

### `Hopkins Statistic`

In [None]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def my_hopkins(X, random_state=None):
    """Estimate the Hopkins statistic of ``X`` (a measure of cluster tendency).

    A sample of ``m = max(1, int(0.1 * n))`` real observations is compared with
    ``m`` synthetic points drawn uniformly inside the per-column [min, max] box
    of ``X``:

    * ``u`` - distance from each synthetic point to its nearest observation;
    * ``w`` - distance from each sampled observation to its nearest *other*
      observation.

    The statistic is ``H = sum(u) / (sum(u) + sum(w))``. With this formula, values
    around 0.5 mean the observations are spaced like uniform noise, while values
    approaching 1 mean the observations sit much closer to each other than
    random points do (i.e. the data tends to cluster).

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like of numbers
        Observations in rows, variables in columns.
    random_state : int, numpy.random.Generator or None, optional
        Seed / generator for the random sampling. ``None`` (the default) gives a
        different estimate on every call; pass an int for a reproducible value.

    Returns
    -------
    float
        The Hopkins statistic, or ``0.0`` when it is undefined because every
        measured distance is zero (e.g. all observations are identical).

    Raises
    ------
    ValueError
        If ``X`` is not 2-dimensional or has fewer than two rows.
    """
    points = np.asarray(X, dtype=float)
    if points.ndim != 2:
        raise ValueError("X must be 2-dimensional (rows = observations, columns = variables)")
    n, d = points.shape
    if n < 2:
        raise ValueError("at least two observations are needed to compute the Hopkins statistic")

    # int(0.1 * n) is 0 for n < 10, which used to end in a division by zero.
    m = max(1, int(0.1 * n))
    rng = np.random.default_rng(random_state)
    nbrs = NearestNeighbors(n_neighbors=1).fit(points)

    # Synthetic points are not part of the data, so their nearest observation is
    # the FIRST neighbour (the previous code took the second one, inflating u).
    synthetic = rng.uniform(points.min(axis=0), points.max(axis=0), size=(m, d))
    u_dist, _ = nbrs.kneighbors(synthetic, 1, return_distance=True)
    ujd = u_dist[:, 0]

    # A sampled observation is its own nearest neighbour (distance 0), so the
    # nearest *other* observation is the SECOND neighbour.
    picked = rng.choice(n, size=m, replace=False)
    w_dist, _ = nbrs.kneighbors(points[picked], 2, return_distance=True)
    wjd = w_dist[:, 1]

    u_total = ujd.sum()
    denominator = u_total + wjd.sum()
    if denominator == 0 or isnan(denominator):
        # Undefined statistic: show the raw distances and fall back to 0.
        print(ujd, wjd)
        return 0.0

    return float(u_total / denominator)

In [None]:
# Hopkins statistic of `data_copy` (the frame produced by the MinMaxScaler cell above).
my_hopkins(data_copy)

## Question-4: According to the Hopkins statistic, is there cluster tendency in the data?

## Model building - KMeans

In [None]:
from sklearn.cluster import KMeans

### Choosing K by comparing Cost against each K

In [None]:
# Fit one K-Means model per candidate k and record its inertia (within-cluster sum
# of squared distances, the same quantity as -model.score(data_copy) on the training
# data) so the "elbow" can be read off the curve.
# Plotting uses matplotlib.pyplot (already imported as plt) rather than pylab.
number_of_clusters = range(1, 5)
kmeans = [
    KMeans(n_clusters=k, max_iter=1000, random_state=42).fit(data_copy)
    for k in number_of_clusters
]
score = [model.inertia_ for model in kmeans]

fig, ax = plt.subplots()
ax.plot(number_of_clusters, score, marker='o')
ax.set_xticks(list(number_of_clusters))  # k is an integer: avoid fractional ticks
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Score')
ax.set_title('Elbow Curve')
plt.show()

## According to the Elbow Curve, the optimum number of clusters is 2. However, your management plans to offer 3 different types of food items to the employees. Hence, we will run K-Means with the number of clusters set to 3.

## Question-5: Write the code for creating the KMeans clusters with number of clusters = 3.

### Hint: Use the function KMeans(n_clusters= no. of clusters ,max_iter=1000,random_state=42)

In [None]:
# K-Means with the 3 clusters asked for above. The placeholder left here before was a
# SyntaxError that stopped the notebook from running past this cell.
# random_state is fixed so the cluster labels are reproducible across runs.
km = KMeans(n_clusters=3, max_iter=1000, random_state=42)
fitClusters = km.fit_predict(data_copy)

### Combining the predicted clusters with the original DF.

In [None]:
# Attach each employee's cluster label to `data` (used by the segment plots below).
data['k_mean_pred'] = fitClusters

# Build the same labelled table for the scaled frame: put the labels in a one-column
# frame, line it up positionally with the scaled rows, then restore 'Emp Id' as index.
clustersDf = pd.DataFrame({'k_mean_pred': fitClusters})
copy_2 = data_copy.reset_index()
combinedDf = pd.concat([copy_2, clustersDf], axis=1).set_index('Emp Id')
combinedDf.head()

## Analysing the segments based on the clusters created by K-means clustering Model

In [None]:
# Per-cluster boxplots of every variable, one figure per clustering result.
# Maps each label column in `data` to the name shown in the figure title.
cluster_columns = {'k_mean_pred': 'K means clustering'}

# Plot every column except the label column(s). The previous `data.columns[:-2]`
# slice also dropped the last real variable, because only one label column
# ('k_mean_pred') is appended to `data`.
feature_columns = [col for col in data.columns if col not in cluster_columns]

# Size the grid from the number of variables instead of assuming a fixed 2x3 layout.
n_cols = 3
n_rows = max(1, -(-len(feature_columns) // n_cols))  # ceiling division

for cluster_col, model_name in cluster_columns.items():
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    panels = axes.ravel()
    for panel, variable in zip(panels, feature_columns):
        sns.boxplot(x=cluster_col, y=variable, data=data, ax=panel)
    # Hide any unused panels in the last row.
    for panel in panels[len(feature_columns):]:
        panel.set_visible(False)
    fig.subplots_adjust(wspace=0.50)
    fig.suptitle('Different Segments of Employees - ' + model_name)

## Now that the segments have been created, we need to profile each segment for reporting

## Question-6: What are the major differences between Employee Segment 0 and Employee Segment 1?

In [None]:
# Write your answer here

## Question-7: Which of the employee segments does not show much interest in any kind of food item?

In [None]:
# Write your answer here