##### Part A feature engineering

In [27]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler #for standardization

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/module_5_feature_engineering_inclass.csv')

In [3]:
df.head()

Unnamed: 0,user_id,age,signup_date,last_purchase_date,total_purchases,total_spent,favorite_product_category,location,gender,email_domain
0,1,33,1978-11-01 17:03,2020-05-11 4:13,10,840.954993,books,Lake Gerald,female,gmail.com
1,2,32,2016-04-08 14:19,1959-01-07 5:45,2,1147.412095,books,North Justinburgh,male,yahoo.com
2,3,19,2021-03-08 7:01,2006-10-29 16:56,6,1301.284835,books,West Alec,female,yahoo.com
3,4,48,2006-01-11 11:04,1966-07-01 20:22,8,1404.261405,books,Annaton,male,hotmail.com
4,5,75,1985-06-16 1:41,2013-05-14 12:44,3,1968.752964,books,East Markburgh,female,yahoo.com


In [8]:
# Extract hour, month, day into new columns (dt helps with this)

# Transform datetime to a datetime data format
df['datetime'] = pd.to_datetime(df['last_purchase_date'])

df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour 

In [9]:
df.head()

Unnamed: 0,user_id,age,signup_date,last_purchase_date,total_purchases,total_spent,favorite_product_category,location,gender,email_domain,datetime,year,month,day,hour
0,1,33,1978-11-01 17:03,2020-05-11 4:13,10,840.954993,books,Lake Gerald,female,gmail.com,2020-05-11 04:13:00,2020,5,11,4
1,2,32,2016-04-08 14:19,1959-01-07 5:45,2,1147.412095,books,North Justinburgh,male,yahoo.com,1959-01-07 05:45:00,1959,1,7,5
2,3,19,2021-03-08 7:01,2006-10-29 16:56,6,1301.284835,books,West Alec,female,yahoo.com,2006-10-29 16:56:00,2006,10,29,16
3,4,48,2006-01-11 11:04,1966-07-01 20:22,8,1404.261405,books,Annaton,male,hotmail.com,1966-07-01 20:22:00,1966,7,1,20
4,5,75,1985-06-16 1:41,2013-05-14 12:44,3,1968.752964,books,East Markburgh,female,yahoo.com,2013-05-14 12:44:00,2013,5,14,12


In [10]:
# Create function to classify amounts
def age_bin(x):
    if x >= 0 and x < 20:
        return 'genz'
    elif x >= 20 and x < 40:
        return 'millennial'
    elif x >= 40 and x < 60:
        return 'baby boomer'
    else:
        return '60+'

In [13]:
# Create new column with rental count range using total_cat() function created above
df['age_total_group'] = df['age'].apply(age_bin)

In [14]:
df.head()

Unnamed: 0,user_id,age,signup_date,last_purchase_date,total_purchases,total_spent,favorite_product_category,location,gender,email_domain,datetime,year,month,day,hour,age_total_group
0,1,33,1978-11-01 17:03,2020-05-11 4:13,10,840.954993,books,Lake Gerald,female,gmail.com,2020-05-11 04:13:00,2020,5,11,4,millennial
1,2,32,2016-04-08 14:19,1959-01-07 5:45,2,1147.412095,books,North Justinburgh,male,yahoo.com,1959-01-07 05:45:00,1959,1,7,5,millennial
2,3,19,2021-03-08 7:01,2006-10-29 16:56,6,1301.284835,books,West Alec,female,yahoo.com,2006-10-29 16:56:00,2006,10,29,16,genz
3,4,48,2006-01-11 11:04,1966-07-01 20:22,8,1404.261405,books,Annaton,male,hotmail.com,1966-07-01 20:22:00,1966,7,1,20,baby boomer
4,5,75,1985-06-16 1:41,2013-05-14 12:44,3,1968.752964,books,East Markburgh,female,yahoo.com,2013-05-14 12:44:00,2013,5,14,12,60+


In [16]:
df['favorite_product_category'].unique()

array(['books', 'home', 'electronics', 'clothing', 'beauty'], dtype=object)

In [19]:
# Dummy variables - convert season to dummies; first - rename season

favorite_cat_mapping = {'books':1, 'home':2, 'electronics':3, 'clothing':4, 'beauty':5}
df['favorite_cat'] = df['favorite_product_category'].map(favorite_cat_mapping)

#check for missing values aand represent all the options

In [20]:
df.head()

Unnamed: 0,user_id,age,signup_date,last_purchase_date,total_purchases,total_spent,favorite_product_category,location,gender,email_domain,datetime,year,month,day,hour,age_total_group,favorite_cat
0,1,33,1978-11-01 17:03,2020-05-11 4:13,10,840.954993,books,Lake Gerald,female,gmail.com,2020-05-11 04:13:00,2020,5,11,4,millennial,1
1,2,32,2016-04-08 14:19,1959-01-07 5:45,2,1147.412095,books,North Justinburgh,male,yahoo.com,1959-01-07 05:45:00,1959,1,7,5,millennial,1
2,3,19,2021-03-08 7:01,2006-10-29 16:56,6,1301.284835,books,West Alec,female,yahoo.com,2006-10-29 16:56:00,2006,10,29,16,genz,1
3,4,48,2006-01-11 11:04,1966-07-01 20:22,8,1404.261405,books,Annaton,male,hotmail.com,1966-07-01 20:22:00,1966,7,1,20,baby boomer,1
4,5,75,1985-06-16 1:41,2013-05-14 12:44,3,1968.752964,books,East Markburgh,female,yahoo.com,2013-05-14 12:44:00,2013,5,14,12,60+,1


In [21]:
# Create season dummies
favorite_cat_dummies = pd.get_dummies(df['favorite_cat'])

In [22]:
favorite_cat_dummies.head()

Unnamed: 0,1,2,3,4,5
0,1,0,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0


In [23]:
favorite_cat_dummies.tail()

Unnamed: 0,1,2,3,4,5
995,0,1,0,0,0
996,0,0,1,0,0
997,0,0,1,0,0
998,0,0,0,1,0
999,0,0,0,1,0


In [24]:
#combine the original data set with this. axis 1 sticking columns to column 
# and axis 0 means sticking on top of each other
df = pd.concat([df,favorite_cat_dummies], axis=1)

In [25]:
df.head()

Unnamed: 0,user_id,age,signup_date,last_purchase_date,total_purchases,total_spent,favorite_product_category,location,gender,email_domain,...,month,day,hour,age_total_group,favorite_cat,1,2,3,4,5
0,1,33,1978-11-01 17:03,2020-05-11 4:13,10,840.954993,books,Lake Gerald,female,gmail.com,...,5,11,4,millennial,1,1,0,0,0,0
1,2,32,2016-04-08 14:19,1959-01-07 5:45,2,1147.412095,books,North Justinburgh,male,yahoo.com,...,1,7,5,millennial,1,1,0,0,0,0
2,3,19,2021-03-08 7:01,2006-10-29 16:56,6,1301.284835,books,West Alec,female,yahoo.com,...,10,29,16,genz,1,1,0,0,0,0
3,4,48,2006-01-11 11:04,1966-07-01 20:22,8,1404.261405,books,Annaton,male,hotmail.com,...,7,1,20,baby boomer,1,1,0,0,0,0
4,5,75,1985-06-16 1:41,2013-05-14 12:44,3,1968.752964,books,East Markburgh,female,yahoo.com,...,5,14,12,60+,1,1,0,0,0,0


In [26]:
df.tail()

Unnamed: 0,user_id,age,signup_date,last_purchase_date,total_purchases,total_spent,favorite_product_category,location,gender,email_domain,...,month,day,hour,age_total_group,favorite_cat,1,2,3,4,5
995,996,69,1994-01-02 1:26,2002-12-06 1:07,48,1395.669474,home,Cynthiaborough,female,gmail.com,...,12,6,1,60+,2,0,1,0,0,0
996,997,35,1957-10-19 19:41,1977-07-19 0:45,45,131.001416,electronics,North Christopherside,female,gmail.com,...,7,19,0,millennial,3,0,0,1,0,0
997,998,49,1970-01-23 0:10,1986-11-01 4:45,38,102.663709,electronics,Castroland,male,hotmail.com,...,11,1,4,baby boomer,3,0,0,1,0,0
998,999,61,2016-02-26 21:04,1952-01-11 10:28,12,1022.037997,clothing,Salasmouth,female,hotmail.com,...,1,11,10,60+,4,0,0,0,1,0
999,1000,50,1982-02-02 7:25,2010-04-24 10:07,30,333.873768,clothing,Smithstad,female,yahoo.com,...,4,24,10,baby boomer,4,0,0,0,1,0


##### Part B PCA

In [29]:
accidents=sns.load_dataset('car_crashes')
accidents.head()

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA


In [32]:
accidents.drop('abbrev', axis=1, inplace=True)
# to make sure that the dataset is purely numerical. This is required before running PCA

In [33]:
accidents.head()

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63


In [34]:
# Standardize the features
scaler = StandardScaler() # Initialize the model
accidents_std = scaler.fit_transform(accidents) # keep tranforming as you are fitting because data keeps coming 

# Perform PCA
pca = PCA()
accidents_pca = pca.fit_transform(accidents_std)

# The transformed data is an array, convert it back into a dataframe
accidents_pca = pd.DataFrame(accidents_pca, columns=[f'PC{i+1}' for i in range(len(accidents.columns))])

# Print the explained variance ratio
print('Explained variance ratio:', pca.explained_variance_ratio_)

# Print the cumulative explained variance ratio
cumsum_variance = np.cumsum(pca.explained_variance_ratio_)
print('Cumulative explained variance ratio:', cumsum_variance)

# Show the first few rows of transformed dataframe
accidents_pca.head()

# the results show that the last component adds just 8%. 
# trading off accuracy for efficiency
# remove the last 2 because it removes just 8%. The goal is to retain 90% of the data.

Explained variance ratio: [0.57342168 0.22543042 0.07865743 0.05007557 0.04011    0.02837999
 0.00392491]
Cumulative explained variance ratio: [0.57342168 0.7988521  0.87750953 0.9275851  0.9676951  0.99607509
 1.        ]


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
0,1.603671,0.133449,0.317881,-0.795293,-0.579719,0.046223,0.210185
1,1.144212,0.858234,0.736626,0.318988,-0.228701,-1.002625,0.008966
2,1.432172,-0.420506,0.338136,0.552513,0.168718,-0.804523,-0.076107
3,2.491584,0.348968,-1.788747,0.264064,-0.372382,-0.481849,-0.147636
4,-1.750638,0.633625,-0.136176,-0.974916,-0.315811,0.17851,-0.068958


In [36]:
accidents_std #show the arrays with the means

array([[ 0.73744574,  1.1681476 ,  0.43993758,  1.00230055,  0.27769155,
        -0.58008306,  0.4305138 ],
       [ 0.56593556,  1.2126951 , -0.21131068,  0.60853209,  0.80725756,
         0.94325764, -0.02289992],
       [ 0.68844283,  0.75670887,  0.18761539,  0.45935701,  1.03314134,
         0.0708756 , -0.98177845],
       [ 1.61949811, -0.48361373,  0.54740815,  1.67605228,  1.95169961,
        -0.33770122,  0.32112519],
       [-0.92865317, -0.39952407, -0.8917629 , -0.594276  , -0.89196792,
        -0.04841772,  1.26617765],
       [-0.5366299 ,  0.01692   , -0.63009543, -0.63369765, -0.29104195,
        -0.2914793 ,  0.22027622],
       [-1.22267063, -0.01511416, -0.5833691 , -0.9356316 , -1.38129335,
         1.02964051,  1.32270187],
       [ 0.10040792,  0.57951992, -0.01564416,  0.1166575 ,  0.54542553,
         1.42128062,  0.6907692 ],
       [-2.42324191, -1.49769509, -1.92383077, -1.71868879, -2.17430102,
         2.19175919,  0.06330968],
       [ 0.51693265, -0.6202

In [38]:
# Let's do the same, but now let's reduce to 2 components
# Perform PCA
pca = PCA(n_components = 0.9)
accidents_pca = pca.fit_transform(accidents_std)

# The transformed data is an array, convert it back into a dataframe
accidents_pca = pd.DataFrame(accidents_pca)

# Print the explained variance ratio
print('Explained variance ratio:', pca.explained_variance_ratio_)

# Print the cumulative explained variance ratio
cumsum_variance = np.cumsum(pca.explained_variance_ratio_)
print('Cumulative explained variance ratio:', cumsum_variance)

# Show the first few rows of transformed dataframe
accidents_pca.head()

#the cummulative explained variance is 0.92 which is a good choice because you are losing only about 8% of the data. 

Explained variance ratio: [0.57342168 0.22543042 0.07865743 0.05007557]
Cumulative explained variance ratio: [0.57342168 0.7988521  0.87750953 0.9275851 ]


Unnamed: 0,0,1,2,3
0,1.603671,0.133449,0.317881,-0.795293
1,1.144212,0.858234,0.736626,0.318988
2,1.432172,-0.420506,0.338136,0.552513
3,2.491584,0.348968,-1.788747,0.264064
4,-1.750638,0.633625,-0.136176,-0.974916


The Principal Component Analysis (PCA) using the variance ratio and the cumulative explained variance ratio provide insights into the amount of information retained by each principal component.

The explained variance ratio represents the proportion of the dataset's variance explained by each principal component. The array [0.57342168, 0.22543042, 0.07865743, 0.05007557] indicates that the first principal component explains approximately 57.34% of the variance, the second principal component explains 22.54% of the variance, the third explains 7.87% of the variance, and the fourth explains 5.01% of the variance. These values indicate the relative importance of each principal component in capturing the variability in the data.

The cumulative explained variance ratio, on the other hand, shows the cumulative proportion of variance explained by the principal components. The array [0.57342168, 0.7988521, 0.87750953, 0.9275851] indicates that the first principal component explains 57.34% of the variance, the first two components together explain approximately 79.89% of the variance, the first three components explain 87.75% of the variance, and all four components combined explain around 92.76% of the variance. The cumulative explained variance ratio helps determine how much information is retained as more principal components are considered.

These ratios are useful in deciding how many principal components to retain in a PCA analysis. By examining the cumulative explained variance ratio, we determine the number of components needed to capture a desired amount of variance.

##### The end