In [None]:
import numpy as np
import pandas as pd
import missingno
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import svm
%matplotlib inline

## Data Inspection

In [None]:

# Path of the input CSV, resolved relative to the notebook's working directory.
dataPath="sample_data.csv"
# Read the whole file into the DataFrame that the rest of the notebook
# transforms and analyses.
df=pd.read_csv(dataPath)
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

In [None]:
# Names of the columns that contain at least one missing value.
has_missing = df.isnull().any()
missingdata_df = df.columns[has_missing].tolist()

# Nullity matrix restricted to those columns.
missingno.matrix(df[missingdata_df])

## Data Preprocessing

In [None]:



# NOTE: this cell rebinds `df` and replaces `ref_group` by integer codes, so it
# is not idempotent -- reload the data before running it a second time.

##-1- Delete (almost) empty columns
# A column is dropped when more than 90% of its values are missing.
cols_to_delete = df.columns[df.isnull().sum() / len(df) > .90]
df = df.drop(columns=cols_to_delete)

##-2- Removing redundant data
# Correlation is only defined for numeric/boolean columns. Selecting them
# explicitly keeps this step working on pandas >= 2.0, where DataFrame.corr()
# raises on string columns instead of silently skipping them. The label column
# is excluded so that it can never be dropped as "redundant".
numeric_features = (
    df.drop(columns="ref_group", errors="ignore")
      .select_dtypes(include=["number", "bool"])
)
corr_matrix = numeric_features.corr().abs()  # absolute pairwise correlations
# Keep only the strict upper triangle so every pair is inspected once.
# (The plain `bool` dtype replaces `np.bool`, which was removed in NumPy 1.24.)
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
# A feature is redundant when it correlates above 0.95 with an earlier feature.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
df = df.drop(columns=to_drop)

##-3- Replacing missing values in columns
# Mean imputation is only meaningful for numeric columns; non-numeric columns
# are left untouched instead of making `.mean()` raise.
numeric_cols = df.select_dtypes(include="number").columns
for col in numeric_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mean())

##-4- Removing constant columns
# A column is kept when at least one row differs from the first row.
# `.copy()` gives an independent frame so the assignments below do not trigger
# SettingWithCopyWarning.
df = df.loc[:, (df != df.iloc[0]).any()].copy()

##-5- Converting categorical data to numerical
df["ref_group"] = df["ref_group"].astype('category')
# `d` maps each integer code back to its original category and is used for
# legend labels further down the notebook.
d = dict(enumerate(df["ref_group"].cat.categories))
print(d)
df["ref_group"] = df["ref_group"].cat.codes

# df.to_csv("processed_data.csv")

## Visualization of natural trends in the dataset

In [None]:
# Boolean mask that is True only for the label column.
is_label = df.columns == "ref_group"

# X: every feature column; Y: one-column DataFrame holding the label codes.
X = df.loc[:, ~is_label]
Y = df.loc[:, is_label]

In [None]:
# Standardise the features, then project them onto two principal components.
scaler = StandardScaler()
scaled_data = scaler.fit(X).transform(X)

pca = PCA(n_components=2)
x_pca = pca.fit(scaled_data).transform(scaled_data)

# Scatter of the projection, coloured by the ref_group code of each row.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x_pca[:, 0], x_pca[:, 1], c=df['ref_group'], cmap='plasma')
ax.set_title("Overall data visualization")
ax.set_xlabel('First principal component')
ax.set_ylabel('Second Principal Component')

# Unsupervised clustering

### 1 - Elbow method for finding optimal number of clusters 

In [None]:
# Elbow method: fit K-means for k = 1..9 and record the inertia (within-cluster
# sum of squared distances) of each fit.
#
# The clustering is run on the feature columns only. Fitting on the full `df`
# would feed the `ref_group` label into an unsupervised method and bias the
# curve.
elbow_features = df.loc[:, df.columns != "ref_group"]

distortions = []
K = range(1, 10)
for k in K:
    # Fixed seed and explicit n_init keep the curve reproducible across runs
    # and across scikit-learn versions.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeanModel.fit(elbow_features)
    distortions.append(kmeanModel.inertia_)

plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Redraw the elbow curve from the inertias collected in the previous cell.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Scree plot: share of variance explained by each leading principal component
# of the standardised features.

# Standardised copy of the features with the original column names. A fresh
# scaler is used so the shared `scaler` object is not refitted as a side effect.
scaled_df = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

# PCA cannot return more components than min(n_samples, n_features); cap the
# requested 10 so the cell still runs when preprocessing left fewer columns.
n_scree_components = min(10, scaled_df.shape[0], scaled_df.shape[1])
pca = PCA(n_components=n_scree_components)

# Fit the PCA model to the data.
pca_fit = pca.fit(scaled_df)

PC_values = np.arange(pca.n_components_) + 1  # 1-based component numbers
plt.plot(PC_values, pca.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()

### 2 - K-means clustering

In [None]:
def _scatter_by_class(points, classes, names):
    """Scatter 2-D ``points`` with one colour and legend entry per class code.

    points  : array whose first two columns are the x / y coordinates.
    classes : array with one class code per row of ``points``.
    names   : mapping from class code to legend text; codes missing from the
              mapping are shown as their string form.
    """
    for cls in np.unique(classes):
        members = classes == cls
        plt.scatter(points[members, 0], points[members, 1],
                    label=names.get(int(cls), str(cls)))


# Features and ground-truth class codes.
data = X
y = df.loc[:, df.columns == "ref_group"]
Y = y.to_numpy().flatten()

# Project the features onto the first two principal components.
# NOTE(review): PCA is applied to the unscaled features here, whereas the
# earlier overview plot standardises them first -- confirm this is intended.
pca = PCA(2)
df1 = pca.fit_transform(data)

# Cluster the 2-D projection. The seed and explicit n_init make the result
# reproducible.
n_clusters = 6
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
cluster_ids = kmeans.fit_predict(df1)

# Give every cluster the most frequent true class among its members.
# The result is written to a separate array: relabelling `cluster_ids` in place
# would let a later `== cluster` test also match points that were already
# relabelled to that number.
label = np.empty_like(cluster_ids)
for cluster in range(n_clusters):
    members = cluster_ids == cluster
    if not members.any():
        continue
    classes, counts = np.unique(Y[members], return_counts=True)
    # np.unique returns sorted classes, so ties resolve to the smallest code.
    label[members] = classes[counts.argmax()]

acc = accuracy_score(Y, label)
print("Accuracy using K-means = ", acc)

# Classes that were actually assigned to at least one cluster.
u_labels = np.unique(label)

# Plot the clusters, coloured by their assigned class.
_scatter_by_class(df1, label, d)
plt.legend()
plt.show()

# Same plot with the cluster centroids overlaid in black.
centroids = kmeans.cluster_centers_
_scatter_by_class(df1, label, d)
plt.scatter(centroids[:,0] , centroids[:,1] , s = 10, color = 'k')
plt.legend(loc='best', bbox_to_anchor=(0.5, 0., 0.5, 0.5))
plt.tight_layout()
plt.show()

## Semi-supervised learning

In [None]:
# ref_group code that this notebook treats as "unlabelled".
UNLABELED_CODE = 6
# Number of leading labelled rows used for training; the rest is the test set.
N_TRAIN = 2414

# Independent copies: a later cell assigns new labels into `unlabeled_data`,
# which must not be a view on `df`.
XL = labeled_data = df.loc[df['ref_group'] != UNLABELED_CODE].copy()    ## labelled examples
XUL = unlabeled_data = df.loc[df['ref_group'] == UNLABELED_CODE].copy()  ## unlabelled examples

if len(XL) <= N_TRAIN:
    raise ValueError(
        "Need more than {} labelled rows to build a test set, got {}".format(N_TRAIN, len(XL))
    )

# Select label and features by name instead of by position, so the split stays
# correct wherever `ref_group` sits in the frame.
feature_cols = [col for col in XL.columns if col != "ref_group"]

## splitting into training and test sets (ordered split, no shuffling)
XL_train = XL.iloc[:N_TRAIN][feature_cols]
YL_train = XL.iloc[:N_TRAIN]["ref_group"]

XL_test = XL.iloc[N_TRAIN:][feature_cols]
YL_test = XL.iloc[N_TRAIN:]["ref_group"]

XUL_train = XUL[feature_cols]
YUL_train = XUL["ref_group"]

## training model
# random_state fixes the internal cross-validation used for probability=True.
clf = svm.SVC(kernel='linear', probability=True, C=1, random_state=0).fit(XL_train, YL_train)
print("Baseline accuracy on held-out labelled data:", clf.score(XL_test, YL_test))

## probability of new class labels
clp = clf.predict_proba(XUL_train)
lab = clf.predict(XUL_train)
# One probability column per class known to the classifier: C1Prob, C2Prob, ...
prob_cols = ["C{}Prob".format(k) for k in range(1, clp.shape[1] + 1)]
dfl = pd.DataFrame(clp, columns=prob_cols)
dfl['lab'] = lab
# `dfl` has a fresh 0..n-1 index while YUL_train keeps the row labels of `df`;
# assign by position so the values are not realigned into NaNs.
dfl['actual'] = YUL_train.to_numpy()
dfl['max'] = dfl[prob_cols].max(axis=1)

In [None]:
# Confusion matrix of the classifier on the held-out labelled rows.
from sklearn import metrics
import seaborn as sns

y_pred_class = clf.predict(XL_test)

cm = metrics.confusion_matrix(YL_test, y_pred_class)
print(cm)

# Annotated heat map of the same matrix.
ax = sns.heatmap(cm, annot=True, cmap='Blues')
ax.set_title('class prediction for labelled data\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

## Display the visualization of the Confusion Matrix.
plt.show()

In [None]:
# Self-training sweep: for each confidence threshold, add the unlabelled rows
# whose top class probability exceeds it (with their predicted label) to the
# training set, refit the SVM and record its accuracy on the labelled test set.

# np.asarray is a no-op on arrays, so this cell can be re-run safely
# (`.to_numpy()` would fail the second time because XL_train is then an array).
XL_train = np.asarray(XL_train)
XUL = np.asarray(XUL_train)
XL_test_arr = np.asarray(XL_test)  # score on arrays, matching how the model is fitted

nc = np.arange(.35, 1, .03)   # candidate confidence thresholds
acc = np.empty(len(nc))       # one accuracy per threshold

# NOTE: `clf` is overwritten on every iteration; after the loop it holds the
# model trained with the last (highest) threshold, which is the model the
# "Label Propagation" cell below uses.
for i, k in enumerate(nc):
    conf_ind = (dfl["max"] > k).to_numpy()

    X_train1 = np.append(XL_train, XUL[conf_ind, :], axis=0)
    y_train1 = np.append(YL_train, dfl.loc[conf_ind, 'lab'])
    clf = svm.SVC(kernel='linear', probability=True, C=1, random_state=0).fit(X_train1, y_train1)
    acc[i] = clf.score(XL_test_arr, YL_test)

In [None]:
# Distribution of the classifier's top class probability over the unlabelled
# rows. (matplotlib is already imported in the first cell.)
plt.hist(dfl["max"])
plt.title('Histogram of max-probability of class labels')
plt.xlabel('Max Probability belonging to a class')
plt.ylabel('No. of examples')
plt.show()

In [None]:
# Test accuracy recorded for each confidence threshold, indexed by threshold.
x = pd.Series(data=acc, index=nc)

# Line plot of accuracy against threshold.
ax = x.plot()
ax.set_title('Confidence vs Accuracy')
ax.set_xlabel('Confidence')
ax.set_ylabel('Accuracy')
plt.show()

## Label Propagation

In [None]:
# Pseudo-label the unlabelled rows with the current classifier; predictions
# whose top probability is below the threshold stay in the "unlabelled" group.
CONFIDENCE_THRESHOLD = 0.65
UNLABELED_GROUP_CODE = 6  # ref_group code this notebook uses for unlabelled rows

clp2 = clf.predict_proba(XUL)
lab2 = clf.predict(XUL)
# One probability column per class known to the classifier: C1Prob, C2Prob, ...
prob_cols2 = ["C{}Prob".format(k) for k in range(1, clp2.shape[1] + 1)]
df2 = pd.DataFrame(clp2, columns=prob_cols2)
df2['lab'] = lab2

df2['max'] = df2[prob_cols2].max(axis=1)
df2.loc[df2['max'] < CONFIDENCE_THRESHOLD, 'lab'] = UNLABELED_GROUP_CODE

# `assign` builds a new frame instead of writing into one derived from a slice
# of `df`, which avoids SettingWithCopyWarning. Values are assigned by
# position because `df2` has a fresh 0..n-1 index.
unlabeled_data = unlabeled_data.assign(ref_group=df2['lab'].to_numpy())

In [None]:
# Stack the labelled rows and the (now pseudo-labelled) unlabelled rows back
# into one frame. Labelled rows come first, so the row order of `df_new`
# generally differs from that of `df`; the original index labels are kept.
frames = [labeled_data,unlabeled_data]
df_new = pd.concat(frames)

## Clustering with newly labelled overall data

In [None]:
X = df_new.loc[:, df_new.columns != "ref_group"]
Y = df_new.loc[:, df_new.columns == "ref_group"]

# `df_new` lists the labelled rows before the unlabelled ones, so its row order
# generally differs from `df`. The projection is therefore recomputed from
# df_new's own features; reusing `scaled_data` (built in df's order) would pair
# points with the wrong labels.
scaled_new = StandardScaler().fit_transform(X)

pca = PCA(n_components=2)
pca.fit(scaled_new)
x_pca = pca.transform(scaled_new)

# Class code of every row, in the same order as x_pca.
label = Y.values.flatten()
u_labels = np.unique(label)

# Legend text for each ref_group code; code 6 holds the rows that were still
# unlabelled after pseudo-labelling.
f = {0:'group_A',1:'group_B',2:'group_C',3:'group_D',4:'group_E',5:'group_X',6:'new_group'}

# Plot the results, one colour per class.
for i in u_labels:
    # Codes without an entry in `f` are shown as their number instead of raising.
    plt.scatter(x_pca[label == i , 0] , x_pca[label == i , 1] , label = f.get(int(i), str(i)))
plt.legend()
plt.title('Visualization of dataset after labelling of unlabelled data')

plt.xlabel('First principal component of PCA')
plt.ylabel('Second Principal Component of PCA')
plt.show()