In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [3]:
data = pd.read_csv('advertising-pca.csv')
data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Clicked on Ad
0,68.95,35,61833.9,256.09,0
1,80.23,31,68441.85,193.77,0
2,69.47,26,59785.94,236.5,0
3,74.15,29,54806.18,245.89,0
4,68.37,35,73889.99,225.58,0


In [4]:
data.shape

(1000, 5)

In [5]:
X = data.drop('Clicked on Ad', axis = 1)
X.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage
0,68.95,35,61833.9,256.09
1,80.23,31,68441.85,193.77
2,69.47,26,59785.94,236.5
3,74.15,29,54806.18,245.89
4,68.37,35,73889.99,225.58


In [6]:
X.shape

(1000, 4)

In [7]:
# Step 1: Apply PCA on the indepedent features (varaiables) X

In [8]:
pca = PCA()
principalComponents = pca.fit_transform(X)
print(principalComponents)

[[ 6.83398086e+03  6.74604261e+01 -1.18287949e+01  3.03639225e+00]
 [ 1.34418621e+04  1.08971998e+00  1.06819117e+01 -1.82313102e+00]
 [ 4.78600189e+03  5.13795392e+01 -5.92500578e+00 -6.65359855e+00]
 ...
 [-1.25843440e+04 -4.74117235e+01 -1.57038974e+00  9.96362410e+00]
 [-1.30791938e+04  2.24368936e+01 -6.18988203e+00 -1.80664444e+01]
 [-2.51241908e+04  2.45370957e+01 -1.36332550e+01 -1.33558410e+01]]


In [9]:
principalComponents.shape

(1000, 4)

In [10]:
# Step 2 : Check for the variance ratio of the individual principal components

In [11]:
pca.explained_variance_ratio_

array([9.99988833e-01, 9.84025892e-06, 9.74140204e-07, 3.52420098e-07])

In [12]:
np.round(pca.explained_variance_ratio_, 2)

array([1., 0., 0., 0.])

In [13]:
# Step 3 : Decide the number of components that the project needs and recreate the PCA model
# and the principal components

In [14]:
# Create a PCA model for 2 components
pca = PCA(n_components = 1)
principalComponents = pca.fit_transform(X)
print(principalComponents)

[[ 6.83398086e+03]
 [ 1.34418621e+04]
 [ 4.78600189e+03]
 [-1.93742969e+02]
 [ 1.88900287e+04]
 [ 4.76160800e+03]
 [-1.14710882e+03]
 [-3.04067036e+04]
 [ 1.38620405e+04]
 [ 6.42327407e+02]
 [-9.36755564e+03]
 [ 7.49106750e+03]
 [-3.36315140e+03]
 [-3.26032327e+03]
 [-2.40240316e+04]
 [-2.81781064e+03]
 [-3.10631783e+04]
 [ 1.65110826e+04]
 [-2.39125155e+04]
 [-3.11783048e+04]
 [ 9.80237750e+03]
 [ 5.01562521e+03]
 [-2.23643121e+04]
 [ 6.62875663e+03]
 [ 1.39622870e+04]
 [ 9.82802658e+03]
 [-1.69329642e+04]
 [ 3.29575488e+03]
 [-2.22911098e+04]
 [-8.81998538e+03]
 [-3.52662047e+03]
 [-9.40602446e+03]
 [-2.94167065e+04]
 [-2.47719690e+04]
 [-9.41912986e+03]
 [ 6.38952722e+03]
 [ 1.77072542e+03]
 [ 2.14353315e+04]
 [ 2.42579469e+03]
 [-2.74916010e+04]
 [ 2.69198123e+03]
 [ 4.78425701e+03]
 [ 1.15724089e+04]
 [ 9.92964542e+03]
 [ 2.51967266e+03]
 [-1.42453898e+03]
 [-4.01621914e+03]
 [ 1.20587390e+04]
 [-2.27672840e+03]
 [-7.13973380e+02]
 [ 6.52626847e+03]
 [ 3.52609398e+03]
 [-1.6498866

In [15]:
# By losing 2 coulmns we are still able to retain 98% information of the dataset

In [16]:
# Step 4 : Create a new dataframe with these Principal components and the labels (y)

In [17]:
df_transformed = pd.DataFrame(principalComponents, columns=['PC1'])
df_transformed

Unnamed: 0,PC1
0,6833.980857
1,13441.862122
2,4786.001889
3,-193.742969
4,18890.028690
...,...
995,16384.593917
996,12782.104713
997,-12584.344034
998,-13079.193783


In [18]:
df_transformed['Clicked on Ad'] = data['Clicked on Ad']
df_transformed.head()

Unnamed: 0,PC1,Clicked on Ad
0,6833.980857,0
1,13441.862122,0
2,4786.001889,0
3,-193.742969,0
4,18890.02869,0


In [19]:
df_transformed.shape

(1000, 2)

In [20]:
df_transformed['Clicked on Ad'].value_counts()

Clicked on Ad
0    500
1    500
Name: count, dtype: int64

In [21]:
# Step 5: Create a Logistic Regression model on this dataset

In [22]:
X = df_transformed.drop('Clicked on Ad', axis = 1)
y = df_transformed['Clicked on Ad']

In [23]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X,y)

In [24]:
a = float(input('Daily Time Spent on Site: '))
b = float(input('Age: '))
c = float(input('Area Income: '))
d = float(input('Daily Internet Usage: '))
featureSet = np.array([[a, b, c, d]])
transformed_ds = pca.transform(featureSet)
model.predict(transformed_ds)

Daily Time Spent on Site:  73
Age:  30
Area Income:  75000
Daily Internet Usage:  210


array([0])