In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## For regresion dataset

In [183]:
from sklearn.datasets import load_diabetes,load_wine

In [50]:
diabetes_dataset = load_diabetes()

In [51]:
diabetes_dataset

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]], shape=(442, 10)),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142.

In [52]:
df = pd.DataFrame(data=diabetes_dataset['data'],columns=diabetes_dataset['feature_names'])

In [53]:
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


##  Principal Component Analysis
PCA (Principal Component Analysis) is a dimensionality reduction technique used mainly for simplifying complex datasets while keeping as much important information as possible.

In [54]:
# standardization
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()

In [55]:
scalar.fit(df)

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [113]:
scala_df = scalar.transform(df)

In [114]:
# extracting the features which carry the most information
from sklearn.decomposition import PCA

In [171]:
pca = PCA(n_components=5) # n_components=2 means total number of feature we want to extract

In [172]:
data_pca = pca.fit_transform(scala_df)

In [173]:
data = pd.DataFrame(data_pca)
# here we can see that now features are converted into two columns
# this process is called dimensionality reduction
data.head()

Unnamed: 0,0,1,2,3,4
0,0.616506,-1.924739,0.719085,0.068396,0.302893
1,-2.820136,1.374185,0.024573,0.475693,0.239057
2,0.321437,-1.601916,0.961294,0.816719,1.189427
3,0.033057,0.277676,-2.14655,-1.289966,-0.171114
4,-0.753232,0.806911,-0.085368,0.150749,0.245479


In [174]:
pca.explained_variance_

array([4.02242801, 1.50202833, 1.21701731, 0.96960587, 0.67601371])

## Compare the model before and after the PCA

In [175]:
x = df
y = diabetes_dataset['target']

In [127]:
# spliting into train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [147]:
# before PCA 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,root_mean_squared_error

linear_regression = LinearRegression()
linear_regression.fit(x_train,y_train)
y_pred = linear_regression.predict(x_test)

score = r2_score(y_test,y_pred)
RMSE = root_mean_squared_error(y_test,y_pred)

print(f"R2 score:{score}")
print(f"Root Mean Squared Error:{RMSE}")


R2 score:0.45260276297191937
Root Mean Squared Error:53.85344583676593


In [176]:
# after doing the pca 
x_pca = pd.DataFrame(data_pca)
y_pca = diabetes_dataset['target']

In [177]:
x_train_pca,x_test_pca,y_train_pca,y_test_pca = train_test_split(x_pca,y_pca,test_size=0.3,random_state=42)

In [181]:
# trainig the model after doing the PCA
linear_regression_pca = LinearRegression()
linear_regression_pca.fit(x_train_pca,y_train_pca)
y_pred_pca = linear_regression_pca.predict(x_test_pca)

score_pca = r2_score(y_test_pca,y_pred_pca)
RMSE_pca = root_mean_squared_error(y_test_pca,y_pred_pca)

print(f"R2 score:{score_pca}")
print(f"Root Mean Squared Error:{RMSE_pca}")


R2 score:0.48530585685783356
Root Mean Squared Error:52.71126433620786


In [None]:
# For the regression problem PCA doesn't make huge image 
#  we can see the result before and after it doesn't make huge change after the PCA the result is 
# slighty improve and error is less 
# before r2_score is around 0.45 after PCA it is around 0.48 slightly improve

## For Classification dataset

In [184]:
wine_dataset = load_wine()

In [186]:
wine_dataset

{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]], shape=(178, 13)),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [200]:
scale_wine_df = scalar.fit_transform(pd.DataFrame(data=wine_dataset['data'],columns=wine_dataset['feature_names']))

In [202]:
wine_pca = PCA(n_components=3)

In [203]:
wine_pca_data = wine_pca.fit_transform(scale_wine_df)

In [205]:
pd.DataFrame(wine_pca_data)
# here we extract 3 from 13 features

Unnamed: 0,0,1,2
0,3.316751,1.443463,-0.165739
1,2.209465,-0.333393,-2.026457
2,2.516740,1.031151,0.982819
3,3.757066,2.756372,-0.176192
4,1.008908,0.869831,2.026688
...,...,...,...
173,-3.370524,2.216289,-0.342570
174,-2.601956,1.757229,0.207581
175,-2.677839,2.760899,-0.940942
176,-2.387017,2.297347,-0.550696


In [206]:
wine_pca.explained_variance_

array([4.73243698, 2.51108093, 1.45424187])

## Traing the model with PCA dataset

In [None]:
x_wine_pca = pd.DataFrame(wine_pca_data)
y_wine_pca = wine_dataset['target']

In [210]:
# spliting the data
x_wine_train_pca,x_wine_test_pca,y_wine_train_pca,y_wine_test_pca = train_test_split(x_wine_pca,y_wine_pca,test_size=0.2, random_state=42)

In [211]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
logistic_reg_pca = LogisticRegression()
logistic_reg_pca.fit(x_wine_train_pca,y_wine_train_pca)
y_pred_pca = logistic_reg_pca.predict(x_wine_test_pca)

accuracy = accuracy_score(y_wine_test_pca,y_pred_pca)
metrix = confusion_matrix(y_wine_test_pca,y_pred_pca)
report = classification_report(y_wine_test_pca,y_pred_pca)

print(f"Accuracy:{accuracy}")
print(f"Confusion metrix:{metrix}")
print(f"Classification report:{report}")

Accuracy:0.9722222222222222
Confusion metrix:[[13  1  0]
 [ 0 14  0]
 [ 0  0  8]]
Classification report:              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.93      1.00      0.97        14
           2       1.00      1.00      1.00         8

    accuracy                           0.97        36
   macro avg       0.98      0.98      0.98        36
weighted avg       0.97      0.97      0.97        36



## Traning model without PCA

In [193]:
x_wine= pd.DataFrame(data=wine_dataset['data'],columns=wine_dataset['feature_names'])
y_wine = wine_dataset['target']

In [194]:
y_wine

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [214]:
x_wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [213]:
# scaling and spliting the dataset
x_wine_train,x_wine_test,y_wine_train,y_wine_test = train_test_split(x_wine,y_wine,test_size=0.25,random_state=42)

In [212]:
x_wine_train_scaled = scalar.fit_transform(x_wine_train)
x_wine_test_scaled = scalar.transform(x_wine_test)

In [215]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
logistic_reg = LogisticRegression()
logistic_reg.fit(x_wine_train_scaled,y_wine_train)
y_wine_pred = logistic_reg.predict(x_wine_test)

accuracy = accuracy_score(y_wine_test,y_wine_pred)
metrix = confusion_matrix(y_wine_test,y_wine_pred)
report = classification_report(y_wine_test,y_wine_pred)

print(f"Accuracy:{accuracy}")
print(f"Confusion metrix:{metrix}")
print(f"Classification report:{report}")

Accuracy:0.3333333333333333
Confusion metrix:[[15  0  0]
 [18  0  0]
 [12  0  0]]
Classification report:              precision    recall  f1-score   support

           0       0.33      1.00      0.50        15
           1       0.00      0.00      0.00        18
           2       0.00      0.00      0.00        12

    accuracy                           0.33        45
   macro avg       0.11      0.33      0.17        45
weighted avg       0.11      0.33      0.17        45



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# In classification problem PCA made the direct impact on accuracy
#  when I provide the dataset after PCA accuracy level is around 0.98 while providing all features
# it gives the very less accuracy around 0.33