In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.decomposition import TruncatedSVD,PCA

In [None]:
# Load the first dataset into a DataFrame and print its column/dtype summary.
# NOTE(review): the path passed to read_csv is an empty string, so this cell
# presumably fails on a fresh run — fill in the real CSV location before use.
data=pd.read_csv('')
data.info()

In [None]:
# Per-column count of missing values.
data.isna().sum()

In [None]:
# Target: the diagnosis label.
Y=data['diagnosis']
# Features: everything except the identifier, the 'Unnamed: 32' column and
# the target itself.
X=data.drop(columns=['id','Unnamed: 32','diagnosis'])


In [None]:
# Standardise the feature columns; the fitted scaler is kept in `sc`.
sc=StandardScaler().fit(X)
X_scaled=sc.transform(X)
# Map the diagnosis labels to integer codes; the fitted encoder is kept in `lbl`.
lbl=LabelEncoder().fit(Y)
y_encoded=lbl.transform(Y)

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Component count for the exploratory SVD: one less than the smaller
# dimension of X (rows vs. columns).
# Earlier variant, kept for reference:
#n_components=X.shape[1]-1
n_components=min(X.shape)-1
n_components

In [None]:
# Exploratory decomposition with (almost) every component, used below to read
# off how much variance each component explains.
# random_state is fixed because TruncatedSVD's default solver is randomized;
# without a seed the components can differ slightly between runs.
svd=TruncatedSVD(n_components=n_components,random_state=42)
data_svd=svd.fit_transform(X_scaled)

In [None]:
import matplotlib.pyplot as plt
# Cumulative share of variance explained by the first k SVD components.
explained_variance=svd.explained_variance_ratio_.cumsum()
# Build the x-axis from the fitted model rather than from `n_components`:
# a later cell reassigns that name, which would make re-running this cell
# fail with mismatched x/y lengths.
components=np.arange(1,len(explained_variance)+1)
plt.plot(components,explained_variance,marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance Ratio cumsum')
plt.title('Scree Plot')
plt.grid(True)
plt.show()

In [None]:
# Smallest number of components whose cumulative explained variance reaches 95%.
cumulative_explained_variance=svd.explained_variance_ratio_.cumsum()
reached_threshold=cumulative_explained_variance>=0.95
if reached_threshold.any():
    # argmax on a boolean array returns the index of the first True.
    n_components=int(np.argmax(reached_threshold))+1
else:
    # Threshold never reached: argmax would return 0 and silently select a
    # single component, so fall back to every fitted component instead.
    n_components=len(cumulative_explained_variance)
n_components

In [None]:
# Each row of components_ holds one component's weights over the input features.
loadings=svd.components_
features=X.columns
# Label the first two components' weights with the feature names.
svd1,svd2=(pd.Series(weights,index=features) for weights in loadings[:2])
print('components 1',svd1)
print('Components 2',svd2)
print(loadings.shape)

In [None]:
# Final reduction: refit the SVD with the component count chosen from the
# cumulative explained variance (`n_components`) instead of a hard-coded value,
# and seed the randomized solver so the result is reproducible.
svd_model=TruncatedSVD(n_components=n_components,random_state=42)
data_final=svd_model.fit_transform(X_scaled)
# Generate SVD1..SVDk from the actual output width so the column list can never
# fall out of sync with the number of components.
svd_columns=[f'SVD{i}' for i in range(1,data_final.shape[1]+1)]
# Reuse Y's index so the column-wise concat pairs each row with its own label
# even if the source frame does not carry a default 0..n-1 index.
svddf=pd.DataFrame(data_final,columns=svd_columns,index=Y.index)
final_df=pd.concat([svddf,Y],axis=1)
final_df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Split the SVD-reduced frame back into features and target.
X=final_df.drop('diagnosis',axis=1)
Y=final_df['diagnosis']
# Encode the diagnosis labels as integers.
lbl=LabelEncoder()
y_encode=lbl.fit_transform(Y)
# Split on `y_encode` computed just above, so this cell does not depend on the
# similarly named `y_encoded` left over from an earlier cell.
X_train,X_test,Y_train,Y_test=train_test_split(X,y_encode,test_size=0.2,
                                               random_state=42)
# Fit a logistic-regression classifier and report accuracy on the held-out 20%.
lr_model=LogisticRegression(max_iter=1000)
lr_model.fit(X_train,Y_train)
y_pred=lr_model.predict(X_test)
print('Model Accuracy is:',accuracy_score(Y_test,y_pred))


In [None]:
from sklearn.manifold import TSNE

In [None]:
# Load ANSUR_II_FEMALE.csv and discard the 'SNO' column.
data=pd.read_csv('ANSUR_II_FEMALE.csv').drop(columns='SNO')
# Keep only the int64/float64 columns, then drop any row with a missing value.
data_numeric=(
    data
    .select_dtypes(include=['int64','float64'])
    .dropna()
)
data_numeric.info()

Small perplexity values capture the local structure of the data.
Large perplexity values capture both the global and the local structure.
Rule of thumb: the default perplexity value is 30.
perplexity < 3*sqrt(samples)
A better technique is to compare the KL divergence across candidate values:
the perplexity at which the KL divergence is lowest is the one to use.
Candidate perplexity values: [5,10,20,30,50,60]
n_components=2
n_components selects a 2D or a 3D embedding space.

In [None]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Candidate perplexity values to compare.
perplex=[5,10,20,30,50,60]
# Standardise once, outside the loop, so the sweep runs on the same kind of
# input (scaled features) that the final t-SNE model is fitted on.
tsne_input=StandardScaler().fit_transform(data_numeric)
kl_divergence=[]
for p in perplex:
    # Fixed seed so the runs differ only in perplexity.
    tsne=TSNE(n_components=2,perplexity=p,random_state=42)
    tsne.fit_transform(tsne_input)
    # KL divergence of the fitted embedding for this perplexity.
    kl_divergence.append(tsne.kl_divergence_)
plt.plot(perplex,kl_divergence,marker='o')
plt.xlabel('Perplexity')
plt.ylabel('KL Divergence')
plt.title('To find the optimal Perplexity')
plt.grid(True)
plt.show()


Learning rate - optimization parameter
Very low - can get stuck in local minima
Very high - unstable updates
Learning rate default - 200
Small datasets - 50-100
Large datasets - 500-1000

In [None]:
# Perplexity 50 is used: that is where the KL-divergence curve shows its first
# large drop.
# The iteration count is left at its default of 1000 rather than passed as
# n_iter, because newer scikit-learn releases renamed that keyword to max_iter.
# random_state is fixed so the embedding is reproducible between runs.
tsne=TSNE(n_components=2,perplexity=50,learning_rate=100,random_state=42)
# Standardise the numeric features before embedding them.
sc=StandardScaler()
X_scaled=sc.fit_transform(data_numeric)
data_final=tsne.fit_transform(X_scaled)


In [None]:
# Pull the first and second columns of the embedding into separate arrays.
component1,component2=data_final[:,0],data_final[:,1]

In [None]:
# Quick look at the first embedding coordinate.
print(component1)

In [None]:
# Column/dtype summary of the full frame, including non-numeric columns.
data.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter of the two embedding coordinates on a wide figure.
fig,ax=plt.subplots(figsize=(18,8))
ax.scatter(data_final[:,0],data_final[:,1],alpha=0.5)
ax.set_xlabel('TSNE Component 1')
ax.set_ylabel('TSNE Component 2')
ax.set_title('TSNE')
ax.grid(True)
plt.show()


In [None]:
# Collect the two embedding coordinates in a frame for plotting/modelling.
data_df=pd.DataFrame({'TSNE Comp1':data_final[:,0],
                      'TSNE Comp2':data_final[:,1]})
# The embedding rows follow `data_numeric` (built after dropna), not `data`.
# Look the labels up through data_numeric's index and assign them by position;
# index-aligned assignment would give wrong or NaN labels if any row was dropped.
data_df['Height']=data.loc[data_numeric.index,'Height_class'].to_numpy()
data_df.info()
# Colour the embedding by height class.
sns.scatterplot(x='TSNE Comp1',y='TSNE Comp2',data=data_df,hue='Height')
plt.show()

In [None]:
# Target: the height class; features: the remaining (embedding) columns.
Y=data_df['Height']
X=data_df.drop(columns='Height')
# Encode the class labels as integers.
lbl=LabelEncoder()
y_encode=lbl.fit_transform(Y)
# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train,X_test,Y_train,Y_test=train_test_split(
    X,y_encode,test_size=0.2,random_state=42)
# Fit logistic regression with default settings and print test accuracy.
lr_model=LogisticRegression().fit(X_train,Y_train)
y_pred=lr_model.predict(X_test)
print(accuracy_score(Y_test,y_pred))
