Dataset to upload before running this notebook: `bfi.csv` (the bfi data set)

# **Exploratory Factor Analysis (EFA)**

In [None]:
pip install factor_analyzer

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt

In [None]:
# Read the survey data; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="bfi.csv")
# Show the column names so the columns dropped in the next cell can be checked.
df.columns

In [None]:
# Remove the row-name and demographic columns so only the remaining columns
# enter the factor analysis.
# Reassigning (instead of dropping in place) with errors="ignore" keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['rownames', 'gender', 'education', 'age'], errors='ignore')

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) after the
# column drop and missing-value removal performed in the previous cells.
df.info()

In [None]:
# Display the first rows of the cleaned frame as a quick sanity check.
df.head()

In [None]:
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity

# Run Bartlett's sphericity test on the cleaned data; the helper returns the
# chi-square statistic together with its p-value.
bartlett_result = calculate_bartlett_sphericity(df)
chi_square_value, p_value = bartlett_result

# Display both numbers as the cell output.
(chi_square_value, p_value)

In [None]:
from factor_analyzer.factor_analyzer import calculate_kmo

# Compute the Kaiser-Meyer-Olkin statistics; the helper returns two values,
# of which only the second (kmo_model) is displayed here.
kmo_result = calculate_kmo(df)
kmo_all, kmo_model = kmo_result

kmo_model

In [None]:
# Create factor analysis object and perform factor analysis.
# The number of factors is taken from the number of columns in df rather
# than a hard-coded 25, so the cell keeps working if the set of columns
# changes. No rotation is applied at this stage.
fa = FactorAnalyzer(n_factors=df.shape[1], rotation=None)
fa.fit(df)

# Check Eigenvalues (ev is plotted in the scree plot that follows)
ev, v = fa.get_eigenvalues()
ev

In [None]:
# Scree plot: eigenvalue against 1-based factor number, drawn both as
# markers and as a connecting line.
factor_numbers = range(1, df.shape[1] + 1)
plt.scatter(factor_numbers, ev)
plt.plot(factor_numbers, ev)

plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

In [None]:
# Fit a six-factor model with varimax rotation.
fa = FactorAnalyzer(n_factors=6, rotation="varimax")
fa.fit(df)

# Tabulate the loadings with one row per input column.
loadings = fa.loadings_
loadings_table = pd.DataFrame(loadings, index=df.columns)
print(loadings_table)

In [None]:
# Refit with five factors (varimax rotation). This replaces the six-factor
# model held in `fa`, so the following cells report on this solution.
fa = FactorAnalyzer(rotation="varimax", n_factors=5)
fa.fit(df)

# Loadings table, one row per input column.
loadings = fa.loadings_
print(pd.DataFrame(data=loadings, index=df.columns))

In [None]:
# Tabulate the variance figures reported by the fitted model, one row per
# measure and one column per factor.
variance_rows = ['Variance', 'Proportional Var', 'Cumulative Var']
variance_table = pd.DataFrame(fa.get_factor_variance(), index=variance_rows)
print(variance_table)


In [None]:
# Communality of each input column under the fitted factor model.
communalities = pd.DataFrame(
    data=fa.get_communalities(),
    index=df.columns,
    columns=['Communalities'],
)
print(communalities)

# **Principal Component Analysis (PCA)**

In [None]:
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the bundled breast-cancer dataset and pull out the feature matrix
# and the target vector.
breast = load_breast_cancer()
breast_data, breast_labels = breast.data, breast.target

# Report the shape of each array.
for caption, array in (("breast_data count: ", breast_data),
                       ("breast_data labels: ", breast_labels)):
    print(caption, array.shape)

# Turn the target vector into a single column so it can be appended to the
# feature matrix. Using -1 lets NumPy infer the number of rows instead of
# hard-coding 569, so the cell does not break if the sample count differs.
labels = np.reshape(breast_labels, (-1, 1))
final_breast_data = np.concatenate([breast_data, labels], axis=1)
print("Final breast_data count: ", final_breast_data.shape)

# Assemble a DataFrame holding the feature columns plus a trailing 'label'.
bdf = pd.DataFrame(final_breast_data)
features = breast.feature_names
features_labels = np.append(features, 'label')
bdf.columns = features_labels

# Convert the numeric target to readable class names.
# scikit-learn encodes this dataset as 0 = malignant and 1 = benign
# (breast.target_names is ['malignant', 'benign']); the previous mapping had
# the two classes swapped. The keys are floats because the labels were
# concatenated with the float feature matrix. Assigning the mapped column
# back also avoids the chained `inplace=True` replace on a column selection,
# which is not guaranteed to modify bdf under pandas Copy-on-Write.
bdf['label'] = bdf['label'].map({0.0: 'Malignant', 1.0: 'Benign'})
bdf.tail()


In [None]:
# Pull the feature columns out as a NumPy array and standardise them.
raw_features = bdf.loc[:, features].values
scaler = StandardScaler()
x = scaler.fit_transform(raw_features)  # normalizing the features
x.shape

In [None]:
# Overall mean and standard deviation of the scaled feature matrix.
x.mean(), x.std()

In [None]:
# Wrap the scaled matrix in a DataFrame with generic column names
# feature0, feature1, ... (one per column of x).
n_features = x.shape[1]
feat_cols = [f'feature{i}' for i in range(n_features)]
nbdf = pd.DataFrame(data=x, columns=feat_cols)
nbdf.tail()



In [None]:
from sklearn.decomposition import PCA


In [None]:
# Fit a two-component PCA on the standardised feature matrix x and keep the
# transformed data for the table and scatter plot in the following cells.
pca_breast = PCA(n_components=2)
principalComponents_breast = pca_breast.fit_transform(x)

In [None]:
# Put the two component scores into a DataFrame with readable column names.
pc_columns = ['principal component 1', 'principal component 2']
pbdf = pd.DataFrame(principalComponents_breast, columns=pc_columns)
pbdf.tail()

In [None]:
# Share of the total variance captured by each retained component.
explained_ratio = pca_breast.explained_variance_ratio_
print(f'Explained variation per principal component: {explained_ratio}')


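Explained variation per principal component: [0.44272026 0.18971182], i.e. the first component captures about 44.3% of the variance and the second about 19.0% (roughly 63.2% combined).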


In [None]:
# Scatter plot of the first two principal components, one colour per class.
# Only one figure is created; the extra bare plt.figure() call that preceded
# this one left an empty figure behind.
plt.figure(figsize=(10,10))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1',fontsize=20)
plt.ylabel('Principal Component - 2',fontsize=20)
plt.title("Principal Component Analysis of Breast Cancer Dataset",fontsize=20)
targets = ['Benign', 'Malignant']
colors = ['r', 'b']
for target, color in zip(targets,colors):
    # Boolean mask selecting the rows of this class; bdf and pbdf share the
    # same default row index, so the mask can be applied to pbdf directly.
    indicesToKeep = bdf['label'] == target
    plt.scatter(pbdf.loc[indicesToKeep, 'principal component 1'],
                pbdf.loc[indicesToKeep, 'principal component 2'],
                c = color, s = 50, label = target)

# Legend entries come from the label given to each scatter call, so they
# cannot drift out of step with the plotted classes.
plt.legend(prop={'size': 15})
plt.show()

In [None]:
# Fit a PCA that keeps every component to see how the explained variance
# accumulates as components are added.
pca = PCA().fit(x)

plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
y = np.cumsum(pca.explained_variance_ratio_)
# 1-based component counts, sized from the fitted model instead of a
# hard-coded 30, so the first point reads "1 component" rather than "0".
xi = np.arange(1, len(y) + 1)

ax.set_ylim(0.0, 1.1)
ax.plot(xi, y, marker='o', linestyle='--', color='b')

ax.set_xlabel('Number of Components')
# One tick per plotted component (previously only 0-10 were labelled even
# though 30 points were drawn).
ax.set_xticks(xi)
# The values are fractions in [0, 1], not percentages.
ax.set_ylabel('Cumulative explained variance (fraction)')
ax.set_title('The number of components needed to explain variance')

ax.axhline(y=0.95, color='r', linestyle='-')
ax.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

ax.grid(axis='x')
plt.show()

In [None]:
# Refit the PCA on the standardised feature matrix x, this time keeping ten
# components; both names are rebound, replacing the two-component results.
pca_breast = PCA(n_components=10)
principalComponents_breast = pca_breast.fit_transform(x)

In [None]:
# Column names pc0 ... pc9 for the ten principal components.
# This must be a flat list of strings: wrapping it in a second list makes
# pandas build MultiIndex columns instead of plain column labels.
name_of_column = ['pc' + str(i) for i in range(10)]

In [None]:
# Rebuild pbdf from the ten-component scores, labelled with name_of_column.
pbdf = pd.DataFrame(principalComponents_breast, columns=name_of_column)
pbdf.tail()

In [None]:
# Share of the total variance captured by each of the retained components.
message = 'Explained variation per principal component: {}'
print(message.format(pca_breast.explained_variance_ratio_))
