In [83]:
import pandas as pd
import numpy as np 

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline

file = '../../input/drug_data.csv'
drug_data = pd.read_csv(file,'rb', delimiter =';')

In [84]:
drug_data.head(8)

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,1,49788,48246,-5921,96082,126,31287,-57545,-58331,-91699,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,2,-7854,-48246,198437,96082,-31685,-67825,193886,143533,76096,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,3,49788,-48246,-5921,96082,-31685,-46725,80523,-84732,-16209,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,4,-95197,48246,116365,96082,-31685,-14882,-80615,-1928,59042,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,5,49788,48246,198437,96082,-31685,73545,-16334,-45174,-30172,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0
5,6,259171,48246,-122751,24923,-31685,-67825,-30033,-155521,203972,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6,CL0,CL0
6,7,109449,-48246,116365,-57009,-31685,-46725,-109207,-45174,-30172,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6,CL0,CL0
7,8,49788,-48246,-17379,96082,-31685,-132828,193886,-84732,-30172,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0


# <h3> Data manipulation (Object to numericals) 

In [85]:
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            print(column)
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x=0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    print(unique + " : " + str(x))
                    text_digit_vals[unique] = x
                    x+=1        
            df[column] = list(map(convert_to_int, df[column])) 
    return df



In [86]:
for i in range(13,32):
    drug_data[drug_data.columns[i]] = drug_data[drug_data.columns[i]].str.replace('CL','').astype(int)

In [87]:
for i in range(1,13):
    drug_data[drug_data.columns[i]] = drug_data[drug_data.columns[i]].str.replace(',','.').astype(np.float64)

In [88]:
def change_age(cls):
    if cls == -0.95197:
        return '18-24'
    elif cls == -0.07854:
        return '25-34'
    elif cls == 0.49788:
        return '35-44'
    elif cls == 1.09449:
        return '45-54'
    elif cls == 1.82213:
        return '55-64'
    elif cls == 2.59171:
        return '65+'
    
def change_gender(cls):
    if cls == 0.48246:
        return "male"
    elif cls == -0.48246:
        return "female"

def change_country(cls):
    if cls == -0.09765:
        return "Australia"
    elif cls == 0.24923:
        return "Canada"
    elif cls == -0.46841:
        return "New Zealand"
    elif cls == -0.28519:
        return "Other"
    elif cls == 0.21128:
        return "Rep.of Ireland"
    elif cls == 0.96082:
        return "UK"
    elif cls == -0.57009:
        return "USA"
    
def change_ethnicity(cls):
    if cls == -0.50212:
        return "Asian"
    elif cls == -1.10702:
        return "Black"
    elif cls == 1.90725:
        return "Black/Asian"
    elif cls == 0.12600:
        return "White/Asian"
    elif cls == -0.22166:
        return "White/Black"
    elif cls == 0.11440:
        return "Other"
    elif cls == -0.31685:
        return "White"

In [89]:
drug_data['Age'] = drug_data['Age'].apply(change_age)

In [90]:
drug_data['Gender'] = drug_data['Gender'].apply(change_gender)

In [91]:
drug_data['Country'] = drug_data['Country'].apply(change_country)

In [92]:
drug_data['Ethnicity'] = drug_data['Ethnicity'].apply(change_ethnicity)

In [93]:
drug_data.head(8)

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,1,35-44,male,-0.05921,UK,White/Asian,0.31287,-0.57545,-0.58331,-0.91699,...,0,0,0,0,0,0,0,2,0,0
1,2,25-34,female,1.98437,UK,White,-0.67825,1.93886,1.43533,0.76096,...,4,0,2,0,2,3,0,4,0,0
2,3,35-44,female,-0.05921,UK,White,-0.46725,0.80523,-0.84732,-1.6209,...,0,0,0,0,0,0,1,0,0,0
3,4,18-24,male,1.16365,UK,White,-0.14882,-0.80615,-0.01928,0.59042,...,0,0,2,0,0,0,0,2,0,0
4,5,35-44,male,1.98437,UK,White,0.73545,-1.6334,-0.45174,-0.30172,...,1,0,0,1,0,0,2,2,0,0
5,6,65+,male,-1.22751,Canada,White,-0.67825,-0.30033,-1.55521,2.03972,...,0,0,0,0,0,0,0,6,0,0
6,7,45-54,female,1.16365,USA,White,-0.46725,-1.09207,-0.45174,-0.30172,...,0,0,0,0,0,0,0,6,0,0
7,8,35-44,female,-1.7379,UK,White,-1.32828,1.93886,-0.84732,-0.30172,...,0,0,0,0,0,0,0,0,0,0


In [94]:
# drug_data = handle_non_numerical_data(drug_data)


In [95]:
drug_data.head(8)

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,1,35-44,male,-0.05921,UK,White/Asian,0.31287,-0.57545,-0.58331,-0.91699,...,0,0,0,0,0,0,0,2,0,0
1,2,25-34,female,1.98437,UK,White,-0.67825,1.93886,1.43533,0.76096,...,4,0,2,0,2,3,0,4,0,0
2,3,35-44,female,-0.05921,UK,White,-0.46725,0.80523,-0.84732,-1.6209,...,0,0,0,0,0,0,1,0,0,0
3,4,18-24,male,1.16365,UK,White,-0.14882,-0.80615,-0.01928,0.59042,...,0,0,2,0,0,0,0,2,0,0
4,5,35-44,male,1.98437,UK,White,0.73545,-1.6334,-0.45174,-0.30172,...,1,0,0,1,0,0,2,2,0,0
5,6,65+,male,-1.22751,Canada,White,-0.67825,-0.30033,-1.55521,2.03972,...,0,0,0,0,0,0,0,6,0,0
6,7,45-54,female,1.16365,USA,White,-0.46725,-1.09207,-0.45174,-0.30172,...,0,0,0,0,0,0,0,6,0,0
7,8,35-44,female,-1.7379,UK,White,-1.32828,1.93886,-0.84732,-0.30172,...,0,0,0,0,0,0,0,0,0,0


In [96]:
drug_data.dtypes

ID             int64
Age           object
Gender        object
Education    float64
Country       object
Ethnicity     object
Nscore       float64
Escore       float64
Oscore       float64
Ascore       float64
Cscore       float64
Impulsive    float64
SS           float64
Alcohol        int64
Amphet         int64
Amyl           int64
Benzos         int64
Caffeine       int64
Cannabis       int64
Chocolate      int64
Cocaine        int64
Crack          int64
Ecstasy        int64
Heroin         int64
Ketamine       int64
Legalh         int64
LSD            int64
Meth           int64
Mushrooms      int64
Nicotine       int64
Semer          int64
VSA            int64
dtype: object

In [98]:
df = pd.DataFrame(drug_data)
print (df)

        ID    Age  Gender  Education         Country    Ethnicity   Nscore  \
0        1  35-44    male   -0.05921              UK  White/Asian  0.31287   
1        2  25-34  female    1.98437              UK        White -0.67825   
2        3  35-44  female   -0.05921              UK        White -0.46725   
3        4  18-24    male    1.16365              UK        White -0.14882   
4        5  35-44    male    1.98437              UK        White  0.73545   
5        6    65+    male   -1.22751          Canada        White -0.67825   
6        7  45-54  female    1.16365             USA        White -0.46725   
7        8  35-44  female   -1.73790              UK        White -1.32828   
8        9  35-44    male   -0.05921          Canada        White  0.62967   
9       10  55-64  female    1.16365              UK        White -0.24649   
10      11  25-34    male    0.45468              UK        White -1.05308   
11      12  45-54  female   -0.61113           Other        Whit

In [107]:
import csv


In [108]:
with open('../../input/data_manipulated.csv', 'w') as fp:
    a = csv.writer(fp, delimiter=';')
    df.to_csv(fp, sep=';', encoding='utf-8', mode='w')
    