<a href="https://colab.research.google.com/github/youssefhesham200/PCA-and-Fuzzy-C-Means-from-scratch/blob/master/PCA_and_Fuzzy_CMeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer
from scipy.spatial.distance import euclidean
from sklearn.metrics import accuracy_score

In [None]:
# Load the Titanic passenger table into a DataFrame.
# NOTE(review): this is an absolute Windows path, so the cell will not run on another
# machine (including the Colab link at the top of this notebook) — consider a
# relative or configurable data path.
train_data = pd.read_csv('E:\\datasets\\titanic.csv')
# Preview the first rows.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Build the unlabeled feature table: remove the identifier / free-text columns
# and the 'Survived' label, which must not be visible to the clustering.
train_unlabel = train_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'])
train_unlabel.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [None]:
# Ground-truth survival labels, kept aside for evaluating the clusters later.
y_true = train_data['Survived']

In [None]:
# Count missing values per column before imputation.
train_unlabel.isna().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [None]:
# Frequency of each embarkation port; used to choose the fill value for the missing entries.
train_unlabel['Embarked'].value_counts()


S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [None]:
# Fill the missing 'Embarked' entries with 'S', the most frequent port in the counts shown above.
train_unlabel['Embarked'] = train_unlabel['Embarked'].fillna('S')


In [None]:
# Impute the missing ages with the mean of the observed ages.
train_unlabel['Age'] = train_unlabel['Age'].fillna(train_unlabel['Age'].mean())


In [None]:
# Re-check missing values: every column should now report 0.
train_unlabel.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [None]:
# One-hot encode the categorical columns ('Sex', 'Embarked'). drop_first=True drops the
# first level of each, leaving Sex_male, Embarked_Q and Embarked_S.
train_unlabel = pd.get_dummies(train_unlabel, drop_first=True)


In [None]:
# Preview the encoded feature table.
train_unlabel.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,1,0,1
1,1,38.0,1,0,71.2833,0,0,0
2,3,26.0,0,0,7.925,0,0,1
3,1,35.0,1,0,53.1,0,0,1
4,3,35.0,0,0,8.05,1,0,1


In [None]:
# Apply log(1 + x) to every column to compress the skewed numeric ranges.
# Cast to float64 first: on small-integer / boolean columns (the one-hot dummies)
# NumPy evaluates log1p in float16, so log(2) would come out as 0.693359 instead
# of 0.693147 and those columns would carry needless rounding error.
# NOTE: this cell overwrites train_unlabel, so re-running it applies the log twice.
scaler = FunctionTransformer(np.log1p)
train_unlabel = scaler.transform(train_unlabel.astype('float64'))

In [None]:
# Preview the log-transformed features.
train_unlabel.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1.386294,3.135494,0.693147,0.0,2.110213,0.693359,0.0,0.693359
1,0.693147,3.663562,0.693147,0.0,4.280593,0.0,0.0,0.0
2,1.386294,3.295837,0.0,0.0,2.188856,0.0,0.0,0.693359
3,0.693147,3.583519,0.693147,0.0,3.990834,0.0,0.0,0.693359
4,1.386294,3.583519,0.0,0.0,2.202765,0.693359,0.0,0.693359


In [None]:
# Convert the DataFrame to a plain NumPy array: pca() and fuzzy_c below index
# the data positionally (e.g. data[i, :], data[:, j]).
train_unlabel = np.array(train_unlabel)

In [None]:
# (n_samples, n_features) of the array fed to the clustering.
train_unlabel.shape

(891, 8)

In [None]:
#parameters --> any number of components less than features.

def pca(train_unlabel, n_components):
    """Project data onto its leading principal components.

    Parameters
    ----------
    train_unlabel : array-like of shape (n_samples, n_features)
        Numeric data matrix (ndarray, nested list or DataFrame).
    n_components : int
        Number of principal components to keep, 1 <= n_components <= n_features.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_components)
        The mean-centred data expressed in the basis of the ``n_components``
        eigenvectors of the covariance matrix with the largest eigenvalues.
        The sign of each component is whatever ``np.linalg.eigh`` returns.

    Raises
    ------
    ValueError
        If the input is not a non-empty 2-D matrix or ``n_components`` is out
        of range.

    Notes
    -----
    Prints the shape of the projection matrix, (n_features, n_components).
    """
    data = np.asarray(train_unlabel, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("train_unlabel must be a non-empty 2-D array of shape (n_samples, n_features)")

    n_samples, n_features = data.shape
    if not 1 <= n_components <= n_features:
        raise ValueError(
            "n_components must be between 1 and the number of features (%d), got %r"
            % (n_features, n_components)
        )

    # centre every feature on zero
    X_meaned = data - np.mean(data, axis=0)

    # population covariance matrix (divides by n_samples, not n_samples - 1);
    # one matrix product replaces the per-row sum of outer products
    cov_mat = np.dot(X_meaned.T, X_meaned) / n_samples

    # eigh: the covariance matrix is symmetric; eigenvalues come back ascending
    eigen_values, eigen_vectors = np.linalg.eigh(cov_mat)

    # indices of the eigenvalues in descending order
    sorted_index = np.argsort(eigen_values)[::-1]

    # columns = eigenvectors of the n_components largest eigenvalues
    eigenvector_subset = eigen_vectors[:, sorted_index[:n_components]]

    print(eigenvector_subset.shape)

    # (n_samples, n_features) x (n_features, n_components) -> (n_samples, n_components)
    X_projected = np.dot(X_meaned, eigenvector_subset)

    return X_projected
    

In [None]:
# implement fuzzy c_means

class fuzzy_c():
    """Fuzzy C-Means clustering.

    Every sample holds a degree of membership in each cluster; the rows of
    ``membership`` are non-negative and sum to 1. ``fit`` alternates three steps:

    1. centroids  c_j  = sum_i u_ij^g * x_i / sum_i u_ij^g
    2. distances  d_ij = ||x_i - c_j||  (Euclidean)
    3. membership u_ij = 1 / sum_k (d_ij / d_ik)^(2 / (g - 1))

    The misspelled public names (``culsters``, ``compute_centriod``,
    ``compute_distinces``, ``pridect``) are kept so existing callers keep working.

    Parameters
    ----------
    train_data : array-like of shape (n_samples, n_features)
        Numeric data to cluster.
    culsters : int
        Number of clusters (>= 1).
    g : float
        Fuzziness exponent, must be > 1; larger values give softer memberships.
    seed : int, optional
        Seed for the random initial memberships. When omitted, NumPy's global
        random state is used (so ``np.random.seed`` still controls the result).

    Attributes
    ----------
    membership : ndarray (n_samples, culsters) -- current membership degrees
    centroids  : ndarray (culsters, n_features) -- set by compute_centriod
    distances  : ndarray (n_samples, culsters) -- set by compute_distinces
    """

    def __init__(self, train_data, culsters, g, seed=None):
        data = np.asarray(train_data, dtype=float)
        if data.ndim != 2 or data.shape[0] == 0:
            raise ValueError("train_data must be a non-empty 2-D array of shape (n_samples, n_features)")
        if culsters < 1:
            raise ValueError("culsters must be at least 1")
        if g <= 1:
            # the membership exponent is 2 / (g - 1): undefined for g == 1
            raise ValueError("g must be greater than 1")

        self.centroids = []
        self.distances = []
        self.train_data = data
        self.culsters = culsters
        self.g = g

        # one Dirichlet(1, ..., 1) draw per sample: a random row that sums to 1
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.membership = rng.dirichlet(np.ones(culsters), size=data.shape[0])

    def compute_centriod(self):
        """Recompute each centroid as the membership^g-weighted mean of the data."""
        weights = np.power(self.membership, self.g)           # (n_samples, culsters)
        totals = weights.sum(axis=0)[:, np.newaxis]           # (culsters, 1)
        self.centroids = np.dot(weights.T, self.train_data) / totals

    def compute_distinces(self):
        """Recompute the Euclidean distance from every sample to every centroid."""
        centroids = np.asarray(self.centroids, dtype=float)
        offsets = self.train_data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        self.distances = np.linalg.norm(offsets, axis=2)

    def update_membership(self):
        """Recompute the memberships from the current distances.

        Uses u_ij = d_ij^-p / sum_k d_ik^-p with p = 2 / (g - 1), which is
        algebraically the same as 1 / sum_k (d_ij / d_ik)^p. A sample lying
        exactly on one or more centroids would divide by zero in that formula,
        so it is instead given full membership shared equally among the
        centroids it coincides with.
        """
        p = 2 / (self.g - 1)
        dist = np.asarray(self.distances, dtype=float)

        on_centroid = dist == 0
        # zeros are swapped for 1 only to keep the power finite; those rows are overwritten below
        inverse = np.power(np.where(on_centroid, 1.0, dist), -p)
        membership = inverse / inverse.sum(axis=1, keepdims=True)

        rows = on_centroid.any(axis=1)
        if rows.any():
            hits = on_centroid[rows].astype(float)
            membership[rows] = hits / hits.sum(axis=1, keepdims=True)

        self.membership = membership

    def fit(self, iterations):
        """Run ``iterations`` update rounds and return the final membership matrix."""
        for _ in range(iterations):
            self.compute_centriod()
            self.compute_distinces()
            self.update_membership()

        return self.membership

    def pridect(self):
        """Return, for every sample, the index of its highest-membership cluster (as a list)."""
        return list(np.argmax(self.membership, axis=1))
        
        

In [None]:
# train_unlabel is the full dataset (8 features)
# Seed NumPy's global RNG first: fuzzy_c draws its initial memberships from np.random,
# so without a seed every run gives different clusters and a different accuracy.
np.random.seed(42)
fuzzy_c_means1 = fuzzy_c(train_unlabel, 2, 2)  # 2 clusters, fuzziness g = 2

# final membership matrix after 10 update iterations
mem1 = fuzzy_c_means1.fit(10)

# hard cluster index for every sample
pridection1 = fuzzy_c_means1.pridect()

In [None]:
# Accuracy of the full-dataset clustering against the true labels.
# Cluster indices are arbitrary (cluster 0 may just as well be the "survived" group),
# so score both possible cluster-to-label assignments and keep the better one.
# With two clusters, swapping the labels turns an accuracy of a into 1 - a.
acc_full = accuracy_score(y_true, pridection1)
max(acc_full, 1 - acc_full)

0.6531986531986532

In [None]:
# Reduce the 8 features to 3 principal components.
# pca() also prints the shape of its projection matrix, (n_features, n_components).

pca_data = pca(train_unlabel, 3)

(8, 3)


In [None]:
# Confirm the reduced data keeps every sample and has 3 columns.
pca_data.shape

(891, 3)

In [None]:
# Apply the same fuzzy c-means configuration (2 clusters, g = 2, 10 iterations)
# to the PCA-reduced data.
# Seed NumPy's global RNG first: fuzzy_c draws its initial memberships from np.random,
# so without a seed the result changes on every run.
np.random.seed(42)
fuzzy_c_means2 = fuzzy_c(pca_data, 2, 2)

mem2 = fuzzy_c_means2.fit(10)

pridection2 = fuzzy_c_means2.pridect()

In [None]:
# Accuracy of the reduced-dataset clustering against the true labels.
# Cluster indices are arbitrary, so score both possible cluster-to-label assignments
# and keep the better one (with two clusters, swapping labels turns a into 1 - a).
acc_reduced = accuracy_score(y_true, pridection2)
max(acc_reduced, 1 - acc_reduced)

0.6554433221099888