<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#4)-Pre-processing" data-toc-modified-id="4)-Pre-processing-1">4) Pre-processing</a></span><ul class="toc-item"><li><span><a href="#4.1)-Import-packages-and-load-data" data-toc-modified-id="4.1)-Import-packages-and-load-data-1.1">4.1) Import packages and load data</a></span></li><li><span><a href="#4.2)-Creating-dummy-features" data-toc-modified-id="4.2)-Creating-dummy-features-1.2">4.2) Creating dummy features</a></span></li><li><span><a href="#4.3)-Scale-standardization" data-toc-modified-id="4.3)-Scale-standardization-1.3">4.3) Scale standardization</a></span></li><li><span><a href="#4.4)-Split-data-into-training-and-testing-subsets" data-toc-modified-id="4.4)-Split-data-into-training-and-testing-subsets-1.4">4.4) Split data into training and testing subsets</a></span></li></ul></li></ul></div>

# 4) Pre-processing

## 4.1) Import packages and load data

In [2]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Read in the latest dataset.
# index_col=0 uses the first CSV column as the row index instead of
# loading it as a regular data column.
drug_1 = pd.read_csv('../data/drug_1.csv', index_col=0)
# Sanity check: number of rows and columns that were loaded
print(drug_1.shape)
drug_1.head()

(1885, 26)


Unnamed: 0,ID,Age_value,Gender_value,Education_value,Country_value,Ethnicity_value,Nscore,Escore,Oscore,Ascore,...,Gender,Education_level,Education,Country,Ethnicity,Age_level,Amyl_binary,Amyl_user,Cannabis_binary,Cannabis_user
0,1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,...,Female,6,Professional certificate,UK,Mixed-White/Asian,3,Non-user,0,Non-user,0
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,Male,9,Doctorate degree,UK,White,2,User,1,User,1
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,...,Male,6,Professional certificate,UK,White,3,Non-user,0,User,1
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,Female,8,Masters degree,UK,White,1,Non-user,0,User,1
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,...,Female,9,Doctorate degree,UK,White,3,Non-user,0,User,1


In [4]:
# List every column name available in drug_1
drug_1.columns

Index(['ID', 'Age_value', 'Gender_value', 'Education_value', 'Country_value',
       'Ethnicity_value', 'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore',
       'Impulsive', 'SS', 'Amyl', 'Cannabis', 'Age', 'Gender',
       'Education_level', 'Education', 'Country', 'Ethnicity', 'Age_level',
       'Amyl_binary', 'Amyl_user', 'Cannabis_binary', 'Cannabis_user'],
      dtype='object')

## 4.2) Creating dummy features
In the original dataset, participants' education was classified into nine levels (see the table below). We believe it makes more sense to merge these levels into five groups: group1 (no high school degree) includes education levels 1 to 3; group2 (high school degree) includes level 4; group3 (some college experience) includes levels 5 and 6; group4 (college degree) includes level 7; and group5 (graduate degree) includes levels 8 and 9.

We then create dummy variables for our four categorical features: Age, Gender, Country and the newly created Edu_gr.

In [5]:
# Tabulate how many participants fall into each education category.
# 'Education_level' is counted together with 'Education' so that the
# resulting table can be ordered by level rather than by frequency.
edu_counts = drug_1[['Education', 'Education_level']].value_counts()
edu_counts.reset_index(name='Count').sort_values(by='Education_level')

Unnamed: 0,Education,Education_level,Count
8,Left school before 16,1,28
5,Left school at 16,2,99
7,Left school at 17,3,30
4,Left school at 18,4,100
0,Some college but no degree,5,506
3,Professional certificate,6,270
1,University degree,7,480
2,Masters degree,8,283
6,Doctorate degree,9,89


In [6]:
# Group the nine education levels into 5 broader groups:
#   Edu_gr1 = levels 1-3, Edu_gr2 = level 4, Edu_gr3 = levels 5-6,
#   Edu_gr4 = level 7,    Edu_gr5 = levels 8-9
mapping = {
    1:'Edu_gr1',
    2:'Edu_gr1',
    3:'Edu_gr1',
    4:'Edu_gr2',
    5:'Edu_gr3',
    6:'Edu_gr3',
    7:'Edu_gr4',
    8:'Edu_gr5',
    9:'Edu_gr5'
}
drug_1['Edu_gr'] = drug_1['Education_level'].map(mapping)

# Series.map produces NaN for any level that is missing from `mapping`.
# A NaN group would later be dummy-encoded as all zeros, i.e. silently
# treated as the Edu_gr1 baseline, so fail loudly here instead.
unmapped_levels = drug_1.loc[drug_1['Edu_gr'].isna(), 'Education_level'].unique()
assert len(unmapped_levels) == 0, f"Unmapped Education_level values: {unmapped_levels}"

# Confirm the new 'Edu_gr' column was appended
drug_1.columns

Index(['ID', 'Age_value', 'Gender_value', 'Education_value', 'Country_value',
       'Ethnicity_value', 'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore',
       'Impulsive', 'SS', 'Amyl', 'Cannabis', 'Age', 'Gender',
       'Education_level', 'Education', 'Country', 'Ethnicity', 'Age_level',
       'Amyl_binary', 'Amyl_user', 'Cannabis_binary', 'Cannabis_user',
       'Edu_gr'],
      dtype='object')

In [9]:
# Count participants per education group, ordered by group label
drug_1['Edu_gr'].value_counts().sort_index()

Edu_gr1    157
Edu_gr2    100
Edu_gr3    776
Edu_gr4    480
Edu_gr5    372
Name: Edu_gr, dtype: int64

In [11]:
# One-hot encode Gender. drop_first=True discards the first category, so
# only a 'Male' indicator remains and 'Female' acts as the reference level.
gender_dummies = drug_1['Gender'].pipe(pd.get_dummies, drop_first=True)
gender_dummies.head()

Unnamed: 0,Male
0,0
1,1
2,1
3,0
4,0


In [16]:
# One-hot encode the grouped education feature. drop_first=True discards
# the first category, leaving Edu_gr2..Edu_gr5 with Edu_gr1 as the
# reference level.
edu_dummies = drug_1['Edu_gr'].pipe(pd.get_dummies, drop_first=True)
edu_dummies.head()

Unnamed: 0,Edu_gr2,Edu_gr3,Edu_gr4,Edu_gr5
0,0,1,0,0
1,0,0,0,1
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1


In [18]:
# One-hot encode Country. drop_first=True discards the first category,
# which becomes the reference level and gets no indicator column of its own.
country_dummies = drug_1['Country'].pipe(pd.get_dummies, drop_first=True)
country_dummies.head()

Unnamed: 0,Canada,New Zealand,Other,Republic of Ireland,UK,USA
0,0,0,0,0,1,0
1,0,0,0,0,1,0
2,0,0,0,0,1,0
3,0,0,0,0,1,0
4,0,0,0,0,1,0


In [22]:
# Inspect the three representations of Age before encoding:
# numeric value, categorical label, and ordinal level
drug_1[['Age_value', 'Age', 'Age_level']].head()

Unnamed: 0,Age_value,Age,Age_level
0,0.49788,age35-44,3
1,-0.07854,age25-34,2
2,0.49788,age35-44,3
3,-0.95197,age18_24,1
4,0.49788,age35-44,3


In [23]:
# One-hot encode the categorical Age label. drop_first=True discards the
# first category, so 'age18_24' is the reference level (all-zero row).
age_dummies = drug_1['Age'].pipe(pd.get_dummies, drop_first=True)
age_dummies.head()

Unnamed: 0,age25-34,age35-44,age45-54,age55-64,age65+
0,0,1,0,0,0
1,1,0,0,0,0
2,0,1,0,0,0
3,0,0,0,0,0
4,0,1,0,0,0


In [24]:
# New dataframe drug_2 = dummy dataframes placed in front of the drug_1 columns.
# axis=1 concatenates column-wise; rows are aligned on the shared index.
drug_2 = pd.concat([age_dummies, gender_dummies, edu_dummies, country_dummies, drug_1], axis=1)
# Fixed seed so the random preview is identical on every re-run
drug_2.sample(10, random_state=12)

Unnamed: 0,age25-34,age35-44,age45-54,age55-64,age65+,Male,Edu_gr2,Edu_gr3,Edu_gr4,Edu_gr5,...,Education_level,Education,Country,Ethnicity,Age_level,Amyl_binary,Amyl_user,Cannabis_binary,Cannabis_user,Edu_gr
971,0,0,0,0,0,1,0,1,0,0,...,5,Some college but no degree,USA,White,1,Non-user,0,User,1,Edu_gr3
116,0,1,0,0,0,1,0,1,0,0,...,6,Professional certificate,Canada,White,3,Non-user,0,User,1,Edu_gr3
223,0,0,0,1,0,1,0,0,1,0,...,7,University degree,UK,White,5,Non-user,0,Non-user,0,Edu_gr4
285,0,1,0,0,0,1,0,0,1,0,...,7,University degree,UK,White,3,Non-user,0,Non-user,0,Edu_gr4
219,0,1,0,0,0,0,0,1,0,0,...,5,Some college but no degree,UK,White,3,Non-user,0,Non-user,0,Edu_gr3
1515,0,0,0,0,0,1,0,1,0,0,...,6,Professional certificate,USA,White,1,Non-user,0,User,1,Edu_gr3
1206,0,0,0,0,0,1,0,1,0,0,...,5,Some college but no degree,Canada,Mixed-White/Black,1,Non-user,0,User,1,Edu_gr3
1602,1,0,0,0,0,0,0,0,1,0,...,7,University degree,UK,Asian,2,Non-user,0,Non-user,0,Edu_gr4
234,1,0,0,0,0,0,0,0,1,0,...,7,University degree,UK,White,2,Non-user,0,User,1,Edu_gr4
523,0,1,0,0,0,0,0,0,0,0,...,1,Left school before 16,UK,White,3,User,1,User,1,Edu_gr1


In [27]:
# List every column of the combined dataframe
drug_2.columns
# 43 columns = 26 original + 1 gender dummy + 6 country dummies
#            + 1 Edu_gr + 4 education dummies + 5 age dummies

Index(['age25-34', 'age35-44', 'age45-54', 'age55-64', 'age65+', 'Male',
       'Edu_gr2', 'Edu_gr3', 'Edu_gr4', 'Edu_gr5', 'Canada', 'New Zealand',
       'Other', 'Republic of Ireland', 'UK', 'USA', 'ID', 'Age_value',
       'Gender_value', 'Education_value', 'Country_value', 'Ethnicity_value',
       'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore', 'Impulsive', 'SS',
       'Amyl', 'Cannabis', 'Age', 'Gender', 'Education_level', 'Education',
       'Country', 'Ethnicity', 'Age_level', 'Amyl_binary', 'Amyl_user',
       'Cannabis_binary', 'Cannabis_user', 'Edu_gr'],
      dtype='object')

## 4.3) Scale standardization
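The numerical features in the dataset have already been standardized, so no additional scaling is performed in this section. We only save the dataframe with the new dummy features to disk so that later steps can reload it.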

In [25]:
# Save dataframe drug_2 to hard disk.
# to_csv writes the row index as the first CSV column by default, which is
# why the file is read back with index_col=0.
drug_2.to_csv('../data/drug_2.csv')

In [3]:
# Read in the latest dataset in case you restart your computer.
# Reloading from disk lets the notebook resume from this point; pandas
# must still be imported first (see the import cell in section 4.1).
# index_col=0 restores the saved row index instead of loading it as data.
drug_2 = pd.read_csv('../data/drug_2.csv', index_col=0)
# Sanity check: number of rows and columns that were loaded
print(drug_2.shape)
drug_2.head()

(1885, 43)


Unnamed: 0,age25-34,age35-44,age45-54,age55-64,age65+,Male,Edu_gr2,Edu_gr3,Edu_gr4,Edu_gr5,...,Education_level,Education,Country,Ethnicity,Age_level,Amyl_binary,Amyl_user,Cannabis_binary,Cannabis_user,Edu_gr
0,0,1,0,0,0,0,0,1,0,0,...,6,Professional certificate,UK,Mixed-White/Asian,3,Non-user,0,Non-user,0,Edu_gr3
1,1,0,0,0,0,1,0,0,0,1,...,9,Doctorate degree,UK,White,2,User,1,User,1,Edu_gr5
2,0,1,0,0,0,1,0,1,0,0,...,6,Professional certificate,UK,White,3,Non-user,0,User,1,Edu_gr3
3,0,0,0,0,0,0,0,0,0,1,...,8,Masters degree,UK,White,1,Non-user,0,User,1,Edu_gr5
4,0,1,0,0,0,0,0,0,0,1,...,9,Doctorate degree,UK,White,3,Non-user,0,User,1,Edu_gr5


## 4.4) Split data into training and testing subsets
We will model drug consumption for Amyl and Cannabis. As mentioned before, we will try the Age feature both as a numerical type (feature Age_value) and as a categorical type (feature Age).  
  
The code below prepares train and test subsets to model Amyl with Age as numerical type (i.e. using feature 'Age_value').

In [4]:
# Predictors for the Amyl model with Age treated as a numerical feature.
# Indicator (dummy) columns for gender, education group and country:
dummy_cols = [
    'Male',
    'Edu_gr2', 'Edu_gr3', 'Edu_gr4', 'Edu_gr5',
    'Canada', 'New Zealand', 'Other', 'Republic of Ireland', 'UK', 'USA',
]
# Numeric columns: age value, personality scores, impulsivity and sensation seeking
numeric_cols = [
    'Age_value',
    'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore',
    'Impulsive', 'SS',
]
col_list = dummy_cols + numeric_cols
X = drug_2.loc[:, col_list]
X.head()

Unnamed: 0,Male,Edu_gr2,Edu_gr3,Edu_gr4,Edu_gr5,Canada,New Zealand,Other,Republic of Ireland,UK,USA,Age_value,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,SS
0,0,0,1,0,0,0,0,0,0,1,0,0.49788,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084
1,1,0,0,0,1,0,0,0,0,1,0,-0.07854,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575
2,1,0,1,0,0,0,0,0,0,1,0,0.49788,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,-1.37983,0.40148
3,0,0,0,0,1,0,0,0,0,1,0,-0.95197,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084
4,0,0,0,0,1,0,0,0,0,1,0,0.49788,0.73545,-1.6334,-0.45174,-0.30172,1.30612,-0.21712,-0.21575


In [5]:
# Lift the column limit so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)
# Preview only the first rows. Removing the row limit and echoing X would
# dump the entire frame (1885 rows) into the notebook output, so the global
# row limit is deliberately left at its default.
X.head(10)

Unnamed: 0,Male,Edu_gr2,Edu_gr3,Edu_gr4,Edu_gr5,Canada,New Zealand,Other,Republic of Ireland,UK,USA,Age_value,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,SS
0,0,0,1,0,0,0,0,0,0,1,0,0.49788,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084
1,1,0,0,0,1,0,0,0,0,1,0,-0.07854,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575
2,1,0,1,0,0,0,0,0,0,1,0,0.49788,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,-1.37983,0.40148
3,0,0,0,0,1,0,0,0,0,1,0,-0.95197,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084
4,0,0,0,0,1,0,0,0,0,1,0,0.49788,0.73545,-1.6334,-0.45174,-0.30172,1.30612,-0.21712,-0.21575
5,0,1,0,0,0,1,0,0,0,0,0,2.59171,-0.67825,-0.30033,-1.55521,2.03972,1.63088,-1.37983,-1.54858
6,1,0,0,0,1,0,0,0,0,0,1,1.09449,-0.46725,-1.09207,-0.45174,-0.30172,0.93949,-0.21712,0.07987
7,1,0,0,0,0,0,0,0,0,1,0,0.49788,-1.32828,1.93886,-0.84732,-0.30172,1.63088,0.19268,-0.52593
8,0,0,1,0,0,1,0,0,0,0,0,0.49788,0.62967,2.57309,-0.97631,0.76096,1.13407,-1.37983,-1.54858
9,1,0,0,0,1,0,0,0,0,1,0,1.82213,-0.24649,0.00332,-1.42424,0.59042,0.12331,-1.37983,-0.84637


In [6]:
# Compare the three encodings of Amyl consumption side by side:
# raw class label, User/Non-user label, and the 0/1 indicator
drug_2[['Amyl', 'Amyl_binary', 'Amyl_user']].head()

Unnamed: 0,Amyl,Amyl_binary,Amyl_user
0,CL0,Non-user,0
1,CL2,User,1
2,CL0,Non-user,0
3,CL0,Non-user,0
4,CL1,Non-user,0


In [7]:
# Target variable: binary Amyl indicator (1 = user, 0 = non-user)
y = drug_2['Amyl_user']
y.head()

0    0
1    1
2    0
3    0
4    0
Name: Amyl_user, dtype: int64

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y - consider whether the
# User/Non-user ratio should be preserved in both subsets.
test_fraction = 0.20
split_seed = 12
split = train_test_split(X, y, test_size=test_fraction, random_state=split_seed)
X_train, X_test, y_train, y_test = split

# Features and target must have matching row counts within each subset
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1508, 19) (1508,)
(377, 19) (377,)
