Importing the packages required

In [358]:
%matplotlib inline
import pandas as pd
import copy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import Imputer
#import corrplot

# styling
pd.set_option('display.max_columns',150)
plt.style.use('bmh')
from IPython.display import display

import warnings
warnings.filterwarnings("ignore")

# a bit of machine learning
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import chi2,SelectFromModel

Implementing the special Label Binarizer to implement One-hot encoding.

In [359]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer


class LabelBinarizer2:

    def __init__(self):
        self.lb = LabelBinarizer()

    def fit(self, X):
        # Convert X to array
        X = np.array(X)
        # Fit X using the LabelBinarizer object
        self.lb.fit(X)
        # Save the classes
        self.classes_ = self.lb.classes_

    def fit_transform(self, X):
        # Convert X to array
        X = np.array(X)
        # Fit + transform X using the LabelBinarizer object
        Xlb = self.lb.fit_transform(X)
        # Save the classes
        self.classes_ = self.lb.classes_
        if len(self.classes_) == 2:
            Xlb = np.hstack((Xlb, 1 - Xlb))
        return Xlb

    def transform(self, X):
        # Convert X to array
        X = np.array(X)
        # Transform X using the LabelBinarizer object
        Xlb = self.lb.transform(X)
        if len(self.classes_) == 2:
            Xlb = np.hstack((Xlb, 1 - Xlb))
        return Xlb

    def inverse_transform(self, Xlb):
        # Convert Xlb to array
        Xlb = np.array(Xlb)
        if len(self.classes_) == 2:
            X = self.lb.inverse_transform(Xlb[:, 0])
        else:
            X = self.lb.inverse_transform(Xlb)
        return X

In [377]:
young = pd.read_csv('young-people-survey/responses.csv')
young.head()

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,Punk,"Hiphop, Rap","Reggae, Ska","Swing, Jazz",Rock n roll,Alternative,Latino,"Techno, Trance",Opera,Movies,Horror,Thriller,Comedy,Romantic,Sci-fi,War,Fantasy/Fairy tales,Animated,Documentary,Western,Action,History,Psychology,Politics,Mathematics,Physics,Internet,PC,Economy Management,Biology,Chemistry,Reading,Geography,Foreign languages,Medicine,Law,Cars,Art exhibitions,Religion,"Countryside, outdoors",Dancing,Musical instruments,Writing,Passive sport,Active sport,Gardening,Celebrities,Shopping,Science and technology,Theatre,Fun with friends,Adrenaline sports,Pets,Flying,Storm,Darkness,Heights,Spiders,Snakes,Rats,Ageing,Dangerous dogs,Fear of public speaking,Smoking,Alcohol,Healthy eating,Daily events,Prioritising workload,Writing notes,Workaholism,Thinking ahead,Final judgement,Reliability,Keeping promises,Loss of interest,Friends versus money,Funniness,Fake,Criminal damage,Decision making,Elections,Self-criticism,Judgment calls,Hypochondria,Empathy,Eating to survive,Giving,Compassion to animals,Borrowed stuff,Loneliness,Cheating in school,Health,Changing the past,God,Dreams,Charity,Number of friends,Punctuality,Lying,Waiting,New environment,Mood swings,Appearence and gestures,Socializing,Achievements,Responding to a serious letter,Children,Assertiveness,Getting angry,Knowing the right people,Public speaking,Unpopularity,Life struggles,Happiness in life,Energy levels,Small - big dogs,Personality,Finding lost valuables,Getting up,Interests or hobbies,Parents' advice,Questionnaires or polls,Internet usage,Finances,Shopping centres,Branded clothing,Entertainment spending,Spending on looks,Spending on gadgets,Spending on healthy eating,Age,Height,Weight,Number of siblings,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
0,5.0,3.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,5.0,4.0,2.0,5.0,4.0,4.0,1.0,5.0,5.0,3.0,1.0,2.0,1.0,5.0,1.0,3.0,3.0,5.0,3.0,5.0,3.0,3.0,3.0,3.0,5.0,3.0,1.0,1.0,1.0,1.0,5.0,3.0,3.0,2.0,1.0,5.0,5.0,1.0,4.0,4.0,2.0,5.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,5,3.0,1.0,3.0,2.0,never smoked,drink a lot,4.0,2.0,2.0,5.0,4.0,2.0,5.0,4.0,4.0,1.0,3.0,5.0,1.0,1.0,3.0,4.0,1.0,3.0,1.0,3.0,1,4.0,5.0,4.0,3.0,2.0,1.0,1.0,1.0,4,2.0,3,i am always on time,never,3.0,4.0,3.0,4.0,3.0,4.0,3.0,5.0,1.0,1.0,3.0,5.0,5.0,1.0,4.0,5.0,1.0,4.0,3.0,2.0,3.0,4.0,3.0,few hours a day,3.0,4.0,5.0,3.0,3.0,1,3.0,20.0,163.0,48.0,1.0,female,right handed,college/bachelor degree,no,village,block of flats
1,4.0,4.0,2.0,1.0,1.0,1.0,2.0,3.0,5.0,4.0,4.0,1.0,3.0,1.0,4.0,4.0,2.0,1.0,1.0,5.0,2.0,2.0,4.0,3.0,4.0,1.0,3.0,5.0,4.0,1.0,4.0,1.0,3.0,4.0,5.0,2.0,4.0,4.0,5.0,1.0,1.0,4.0,4.0,5.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,3.0,2.0,4.0,2.0,5.0,1.0,1.0,1.0,2.0,1.0,1,1.0,3.0,1.0,4.0,never smoked,drink a lot,3.0,3.0,2.0,4.0,5.0,4.0,1.0,4.0,4.0,3.0,4.0,3.0,2.0,1.0,2.0,5.0,4.0,4.0,1.0,2.0,1,2.0,4.0,3.0,2.0,4.0,4.0,4.0,1.0,3,1.0,3,i am often early,sometimes,3.0,4.0,4.0,4.0,4.0,2.0,4.0,2.0,2.0,5.0,4.0,4.0,4.0,1.0,4.0,3.0,5.0,3.0,4.0,5.0,3.0,2.0,3.0,few hours a day,3.0,4.0,1.0,4.0,2.0,5,2.0,19.0,163.0,58.0,2.0,female,right handed,college/bachelor degree,no,city,block of flats
2,5.0,5.0,2.0,2.0,3.0,4.0,5.0,3.0,5.0,3.0,4.0,1.0,4.0,3.0,5.0,5.0,5.0,1.0,3.0,5.0,3.0,4.0,4.0,2.0,4.0,2.0,5.0,5.0,2.0,2.0,1.0,1.0,2.0,1.0,5.0,2.0,4.0,2.0,4.0,1.0,1.0,5.0,2.0,5.0,2.0,3.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,2.0,1.0,1.0,4.0,2.0,5.0,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,2.0,tried smoking,drink a lot,3.0,1.0,2.0,5.0,3.0,5.0,3.0,4.0,5.0,1.0,5.0,2.0,4.0,1.0,3.0,5.0,4.0,4.0,1.0,5.0,5,5.0,4.0,2.0,5.0,3.0,2.0,5.0,5.0,1,3.0,3,i am often running late,sometimes,2.0,3.0,4.0,3.0,5.0,3.0,4.0,4.0,3.0,4.0,3.0,2.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,5.0,3.0,1.0,few hours a day,2.0,4.0,1.0,4.0,3.0,4,2.0,20.0,176.0,67.0,2.0,female,right handed,secondary school,no,city,block of flats
3,5.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,4.0,2.0,2.0,1.0,2.0,5.0,1.0,2.0,1.0,5.0,4.0,4.0,3.0,3.0,4.0,3.0,1.0,2.0,5.0,1.0,2.0,4.0,4.0,5.0,4.0,1.0,3.0,1.0,2.0,3.0,3.0,5.0,4.0,4.0,2.0,5.0,1.0,5.0,4.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,4.0,3.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,3.0,5.0,5,5.0,4.0,5.0,5.0,former smoker,drink a lot,3.0,4.0,4.0,4.0,5.0,3.0,1.0,3.0,4.0,5.0,2.0,1.0,1.0,5.0,5.0,5.0,5.0,4.0,3.0,3.0,1,1.0,2.0,5.0,5.0,5.0,1.0,5.0,4.0,3,3.0,1,i am often early,only to avoid hurting someone,1.0,1.0,5.0,3.0,1.0,3.0,3.0,2.0,5.0,5.0,4.0,5.0,3.0,3.0,2.0,2.0,1.0,2.0,1.0,1.0,,2.0,4.0,most of the day,2.0,4.0,3.0,3.0,4.0,4,1.0,22.0,172.0,59.0,1.0,female,right handed,college/bachelor degree,yes,city,house/bungalow
4,5.0,3.0,4.0,3.0,2.0,4.0,3.0,5.0,3.0,1.0,2.0,5.0,3.0,2.0,1.0,2.0,4.0,2.0,2.0,5.0,4.0,4.0,5.0,2.0,3.0,3.0,4.0,4.0,3.0,1.0,4.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,5.0,2.0,3.0,3.0,2.0,3.0,1.0,4.0,4.0,1.0,3.0,1.0,3.0,1.0,4.0,3.0,3.0,3.0,2.0,4.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1,2.0,2.0,4.0,3.0,tried smoking,social drinker,4.0,3.0,1.0,2.0,3.0,5.0,5.0,5.0,4.0,2.0,3.0,3.0,2.0,1.0,3.0,5.0,5.0,5.0,1.0,3.0,1,3.0,3.0,4.0,3.0,5.0,3.0,4.0,5.0,3,3.0,3,i am always on time,everytime it suits me,3.0,4.0,2.0,3.0,3.0,3.0,3.0,5.0,4.0,2.0,3.0,5.0,5.0,2.0,3.0,5.0,3.0,3.0,2.0,4.0,3.0,3.0,3.0,few hours a day,4.0,3.0,4.0,3.0,3.0,2,4.0,20.0,170.0,59.0,1.0,female,right handed,secondary school,no,village,house/bungalow


Import the data from .csv file and then checking the data type

In [378]:

young.dtypes

Music                         float64
Slow songs or fast songs      float64
Dance                         float64
Folk                          float64
Country                       float64
Classical music               float64
Musical                       float64
Pop                           float64
Rock                          float64
Metal or Hardrock             float64
Punk                          float64
Hiphop, Rap                   float64
Reggae, Ska                   float64
Swing, Jazz                   float64
Rock n roll                   float64
Alternative                   float64
Latino                        float64
Techno, Trance                float64
Opera                         float64
Movies                        float64
Horror                        float64
Thriller                      float64
Comedy                        float64
Romantic                      float64
Sci-fi                        float64
War                           float64
Fantasy/Fair

Preprocessing the data.
Replacing the NaN values of non-categorical columns with mean of the rest of the data in the column.
Replacing the NaN values of categorical columns with mode of the rest of the data in the column.

Then printing the data after removing NaN values

In [362]:
#print(young[young.columns[young.dtypes!="object"]])
young2=young[young.columns[young.dtypes!="object"]].apply(lambda x: round(x.fillna(x.mean())),axis=0)
#young2 has float and int and replaced by mean
#young2.head()
young3=young[young.columns[young.dtypes=="object"]].apply(lambda x: (x.fillna(x.mode()[0])),axis=0)
#young3 has object type
dfs = [young2,young3]
result = pd.concat( dfs,axis=1)     
result.head(25)  
#result has 
#result=young2.append(young3)
#result.head(25)

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,Punk,"Hiphop, Rap","Reggae, Ska","Swing, Jazz",Rock n roll,Alternative,Latino,"Techno, Trance",Opera,Movies,Horror,Thriller,Comedy,Romantic,Sci-fi,War,Fantasy/Fairy tales,Animated,Documentary,Western,Action,History,Psychology,Politics,Mathematics,Physics,Internet,PC,Economy Management,Biology,Chemistry,Reading,Geography,Foreign languages,Medicine,Law,Cars,Art exhibitions,Religion,"Countryside, outdoors",Dancing,Musical instruments,Writing,Passive sport,Active sport,Gardening,Celebrities,Shopping,Science and technology,Theatre,Fun with friends,Adrenaline sports,Pets,Flying,Storm,Darkness,Heights,Spiders,Snakes,Rats,Ageing,Dangerous dogs,Fear of public speaking,Healthy eating,Daily events,Prioritising workload,Writing notes,Workaholism,Thinking ahead,Final judgement,Reliability,Keeping promises,Loss of interest,Friends versus money,Funniness,Fake,Criminal damage,Decision making,Elections,Self-criticism,Judgment calls,Hypochondria,Empathy,Eating to survive,Giving,Compassion to animals,Borrowed stuff,Loneliness,Cheating in school,Health,Changing the past,God,Dreams,Charity,Number of friends,Waiting,New environment,Mood swings,Appearence and gestures,Socializing,Achievements,Responding to a serious letter,Children,Assertiveness,Getting angry,Knowing the right people,Public speaking,Unpopularity,Life struggles,Happiness in life,Energy levels,Small - big dogs,Personality,Finding lost valuables,Getting up,Interests or hobbies,Parents' advice,Questionnaires or polls,Finances,Shopping centres,Branded clothing,Entertainment spending,Spending on looks,Spending on gadgets,Spending on healthy eating,Age,Height,Weight,Number of siblings,Smoking,Alcohol,Punctuality,Lying,Internet usage,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
0,5.0,3.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,5.0,4.0,2.0,5.0,4.0,4.0,1.0,5.0,5.0,3.0,1.0,2.0,1.0,5.0,1.0,3.0,3.0,5.0,3.0,5.0,3.0,3.0,3.0,3.0,5.0,3.0,1.0,1.0,1.0,1.0,5.0,3.0,3.0,2.0,1.0,5.0,5.0,1.0,4.0,4.0,2.0,5.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,5,3.0,1.0,3.0,2.0,4.0,2.0,2.0,5.0,4.0,2.0,5.0,4.0,4.0,1.0,3.0,5.0,1.0,1.0,3.0,4.0,1.0,3.0,1.0,3.0,1,4.0,5.0,4.0,3.0,2.0,1.0,1.0,1.0,4,2.0,3,3.0,4.0,3.0,4.0,3.0,4.0,3.0,5.0,1.0,1.0,3.0,5.0,5.0,1.0,4.0,5.0,1.0,4.0,3.0,2.0,3.0,4.0,3.0,3.0,4.0,5.0,3.0,3.0,1,3.0,20.0,163.0,48.0,1.0,never smoked,drink a lot,i am always on time,never,few hours a day,female,right handed,college/bachelor degree,no,village,block of flats
1,4.0,4.0,2.0,1.0,1.0,1.0,2.0,3.0,5.0,4.0,4.0,1.0,3.0,1.0,4.0,4.0,2.0,1.0,1.0,5.0,2.0,2.0,4.0,3.0,4.0,1.0,3.0,5.0,4.0,1.0,4.0,1.0,3.0,4.0,5.0,2.0,4.0,4.0,5.0,1.0,1.0,4.0,4.0,5.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,3.0,2.0,4.0,2.0,5.0,1.0,1.0,1.0,2.0,1.0,1,1.0,3.0,1.0,4.0,3.0,3.0,2.0,4.0,5.0,4.0,1.0,4.0,4.0,3.0,4.0,3.0,2.0,1.0,2.0,5.0,4.0,4.0,1.0,2.0,1,2.0,4.0,3.0,2.0,4.0,4.0,4.0,1.0,3,1.0,3,3.0,4.0,4.0,4.0,4.0,2.0,4.0,2.0,2.0,5.0,4.0,4.0,4.0,1.0,4.0,3.0,5.0,3.0,4.0,5.0,3.0,2.0,3.0,3.0,4.0,1.0,4.0,2.0,5,2.0,19.0,163.0,58.0,2.0,never smoked,drink a lot,i am often early,sometimes,few hours a day,female,right handed,college/bachelor degree,no,city,block of flats
2,5.0,5.0,2.0,2.0,3.0,4.0,5.0,3.0,5.0,3.0,4.0,1.0,4.0,3.0,5.0,5.0,5.0,1.0,3.0,5.0,3.0,4.0,4.0,2.0,4.0,2.0,5.0,5.0,2.0,2.0,1.0,1.0,2.0,1.0,5.0,2.0,4.0,2.0,4.0,1.0,1.0,5.0,2.0,5.0,2.0,3.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,2.0,1.0,1.0,4.0,2.0,5.0,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,2.0,3.0,1.0,2.0,5.0,3.0,5.0,3.0,4.0,5.0,1.0,5.0,2.0,4.0,1.0,3.0,5.0,4.0,4.0,1.0,5.0,5,5.0,4.0,2.0,5.0,3.0,2.0,5.0,5.0,1,3.0,3,2.0,3.0,4.0,3.0,5.0,3.0,4.0,4.0,3.0,4.0,3.0,2.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,5.0,3.0,1.0,2.0,4.0,1.0,4.0,3.0,4,2.0,20.0,176.0,67.0,2.0,tried smoking,drink a lot,i am often running late,sometimes,few hours a day,female,right handed,secondary school,no,city,block of flats
3,5.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,4.0,2.0,2.0,1.0,2.0,5.0,1.0,2.0,1.0,5.0,4.0,4.0,3.0,3.0,4.0,3.0,1.0,2.0,5.0,1.0,2.0,4.0,4.0,5.0,4.0,1.0,3.0,1.0,2.0,3.0,3.0,5.0,4.0,4.0,2.0,5.0,1.0,5.0,4.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,4.0,3.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,3.0,5.0,5,5.0,4.0,5.0,5.0,3.0,4.0,4.0,4.0,5.0,3.0,1.0,3.0,4.0,5.0,2.0,1.0,1.0,5.0,5.0,5.0,5.0,4.0,3.0,3.0,1,1.0,2.0,5.0,5.0,5.0,1.0,5.0,4.0,3,3.0,1,1.0,1.0,5.0,3.0,1.0,3.0,3.0,2.0,5.0,5.0,4.0,5.0,3.0,3.0,2.0,2.0,1.0,2.0,1.0,1.0,4.0,2.0,4.0,2.0,4.0,3.0,3.0,4.0,4,1.0,22.0,172.0,59.0,1.0,former smoker,drink a lot,i am often early,only to avoid hurting someone,most of the day,female,right handed,college/bachelor degree,yes,city,house/bungalow
4,5.0,3.0,4.0,3.0,2.0,4.0,3.0,5.0,3.0,1.0,2.0,5.0,3.0,2.0,1.0,2.0,4.0,2.0,2.0,5.0,4.0,4.0,5.0,2.0,3.0,3.0,4.0,4.0,3.0,1.0,4.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,5.0,2.0,3.0,3.0,2.0,3.0,1.0,4.0,4.0,1.0,3.0,1.0,3.0,1.0,4.0,3.0,3.0,3.0,2.0,4.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1,2.0,2.0,4.0,3.0,4.0,3.0,1.0,2.0,3.0,5.0,5.0,5.0,4.0,2.0,3.0,3.0,2.0,1.0,3.0,5.0,5.0,5.0,1.0,3.0,1,3.0,3.0,4.0,3.0,5.0,3.0,4.0,5.0,3,3.0,3,3.0,4.0,2.0,3.0,3.0,3.0,3.0,5.0,4.0,2.0,3.0,5.0,5.0,2.0,3.0,5.0,3.0,3.0,2.0,4.0,3.0,3.0,3.0,4.0,3.0,4.0,3.0,3.0,2,4.0,20.0,170.0,59.0,1.0,tried smoking,social drinker,i am always on time,everytime it suits me,few hours a day,female,right handed,secondary school,no,village,house/bungalow
5,5.0,3.0,2.0,3.0,2.0,3.0,3.0,2.0,5.0,5.0,3.0,4.0,3.0,4.0,4.0,5.0,3.0,1.0,3.0,5.0,5.0,5.0,5.0,2.0,3.0,3.0,4.0,3.0,3.0,2.0,4.0,5.0,3.0,4.0,2.0,3.0,4.0,4.0,1.0,4.0,4.0,3.0,3.0,4.0,4.0,3.0,5.0,2.0,2.0,5.0,1.0,5.0,1.0,5.0,4.0,2.0,1.0,2.0,3.0,1.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,1.0,2,2.0,1.0,1.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,1.0,3.0,4.0,3.0,2.0,3.0,1.0,4.0,2.0,5.0,4.0,4.0,1.0,4.0,2,3.0,5.0,5.0,2.0,4.0,3.0,3.0,3.0,3,2.0,3,3.0,4.0,3.0,3.0,4.0,2.0,2.0,3.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0,5.0,3.0,4.0,2.0,3.0,3.0,3.0,1.0,4,4.0,20.0,186.0,77.0,1.0,never smoked,never,i am often early,only to avoid hurting someone,few hours a day,male,right handed,secondary school,no,city,block of flats
6,5.0,5.0,5.0,3.0,1.0,2.0,2.0,5.0,3.0,1.0,1.0,3.0,1.0,1.0,2.0,3.0,3.0,5.0,2.0,4.0,2.0,1.0,5.0,3.0,1.0,3.0,5.0,5.0,3.0,1.0,2.0,3.0,3.0,1.0,1.0,1.0,2.0,1.0,3.0,5.0,5.0,3.0,3.0,4.0,5.0,3.0,4.0,1.0,1.0,4.0,3.0,2.0,1.0,5.0,3.0,3.0,1.0,3.0,4.0,3.0,5.0,1.0,5.0,1.0,3.0,2.0,1.0,1.0,5,1.0,4.0,1.0,1.0,4.0,3.0,5.0,5.0,5.0,3.0,3.0,4.0,5.0,3.0,4.0,4.0,1.0,2.0,2.0,5.0,3.0,5.0,1.0,4.0,1,5.0,5.0,5.0,3.0,2.0,3.0,1.0,5.0,3,3.0,3,4.0,5.0,5.0,4.0,5.0,4.0,3.0,2.0,3.0,3.0,4.0,3.0,3.0,5.0,5.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,5.0,4.0,3.0,1.0,3.0,4.0,1,5.0,20.0,177.0,50.0,1.0,tried smoking,social drinker,i am often early,never,less than an hour a day,female,right handed,secondary school,no,village,house/bungalow
7,5.0,3.0,3.0,2.0,1.0,2.0,2.0,4.0,5.0,1.0,2.0,3.0,2.0,2.0,3.0,1.0,2.0,3.0,2.0,5.0,4.0,4.0,5.0,2.0,3.0,3.0,4.0,4.0,3.0,1.0,3.0,5.0,2.0,3.0,1.0,1.0,5.0,4.0,1.0,2.0,2.0,2.0,3.0,4.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,4.0,5.0,1.0,3.0,3.0,2.0,2.0,4.0,2.0,5.0,3.0,2.0,4.0,3.0,1.0,5,3.0,1.0,2.0,4.0,2.0,3.0,1.0,3.0,2.0,4.0,3.0,3.0,3.0,1.0,4.0,4.0,2.0,1.0,3.0,5.0,3.0,5.0,2.0,1.0,2,3.0,5.0,2.0,2.0,5.0,3.0,2.0,4.0,4,1.0,4,1.0,4.0,3.0,4.0,2.0,4.0,3.0,4.0,3.0,1.0,4.0,5.0,2.0,2.0,4.0,4.0,3.0,4.0,2.0,5.0,4.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,3,2.0,19.0,184.0,90.0,1.0,current smoker,drink a lot,i am always on time,sometimes,few hours a day,male,right handed,college/bachelor degree,no,city,house/bungalow
8,5.0,3.0,3.0,1.0,1.0,2.0,4.0,3.0,5.0,5.0,1.0,1.0,2.0,2.0,2.0,3.0,1.0,1.0,1.0,5.0,1.0,5.0,5.0,4.0,4.0,5.0,4.0,4.0,5.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,1.0,2.0,1.0,4.0,1.0,1.0,5.0,2.0,1.0,5.0,4.0,1.0,1.0,2.0,3.0,1.0,5.0,5.0,5,2.0,2.0,3.0,4.0,1.0,1.0,2.0,1.0,4.0,2.0,5.0,5.0,4.0,1.0,4.0,2.0,2.0,1.0,4.0,1.0,3.0,2.0,2.0,5.0,1,1.0,4.0,5.0,4.0,5.0,4.0,5.0,5.0,4,1.0,2,2.0,2.0,3.0,4.0,4.0,2.0,2.0,4.0,1.0,3.0,3.0,4.0,5.0,4.0,3.0,1.0,5.0,3.0,5.0,5.0,1.0,4.0,3.0,2.0,1.0,3.0,2.0,1.0,3,4.0,18.0,166.0,55.0,1.0,tried smoking,social drinker,i am often early,sometimes,few hours a day,female,right handed,secondary school,no,city,house/bungalow
9,5.0,3.0,2.0,5.0,2.0,2.0,5.0,3.0,5.0,2.0,3.0,2.0,4.0,4.0,4.0,4.0,5.0,1.0,2.0,5.0,2.0,1.0,5.0,5.0,1.0,3.0,4.0,4.0,4.0,1.0,2.0,3.0,2.0,3.0,3.0,1.0,5.0,1.0,4.0,2.0,1.0,4.0,4.0,5.0,1.0,1.0,1.0,4.0,4.0,4.0,5.0,3.0,1.0,4.0,4.0,1.0,2.0,4.0,3.0,5.0,5.0,2.0,2.0,4.0,5.0,4.0,5.0,3.0,4,4.0,3.0,5.0,3.0,3.0,4.0,2.0,2.0,3.0,3.0,5.0,4.0,5.0,3.0,4.0,3.0,1.0,2.0,5.0,5.0,4.0,5.0,1.0,4.0,1,4.0,5.0,4.0,2.0,3.0,4.0,2.0,5.0,3,2.0,3,1.0,4.0,4.0,3.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,5.0,3.0,5.0,4.0,4.0,1.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,4.0,4.0,2.0,3.0,2,4.0,19.0,174.0,60.0,3.0,never smoked,drink a lot,i am often running late,sometimes,few hours a day,female,right handed,secondary school,no,city,block of flats


Using custom library LabelBinarizer2 the categorical data has been transformed to binary columns and transformed the data into non categorical data

In [384]:
young=result
print(young.shape)
# mark zero values as missing or NaN
#young = young.replace(0, np.NaN)
# drop rows with missing values
print(result.columns[result.isna().any()].tolist())
# summarize the number of rows and columns in the dataset

#print(young.shape)

for i in young.columns[young.dtypes=="object"]:
    #print(young[i].name)
    #li=
    #print(li)
    
    #print((i))
    
    lb_style = LabelBinarizer2()
    lb_results = lb_style.fit_transform(young[i].astype(str))
    #print(type(lb_style.classes_.tolist()))
    column=lb_style.classes_.tolist()
    print(lb_style.classes_)
    print(type(lb_results))
    
    df = pd.DataFrame(data=lb_results, columns=column)
    #print(df)
    young=pd.merge(young,df,left_index=True,right_index=True)
    #young.drop(young[i].name,axis=1)
    del young[young[i].name]
    
    
#     print(lb_style.classes_)
# #     print(type(lb_results))
#     type(pd.DataFrame(lb_results, columns=lb_style.classes_))
print (young.head())
print(young.shape)


(1010, 150)
[]
['current smoker' 'former smoker' 'never smoked' 'tried smoking']
<class 'numpy.ndarray'>
['drink a lot' 'never' 'social drinker']
<class 'numpy.ndarray'>
['i am always on time' 'i am often early' 'i am often running late']
<class 'numpy.ndarray'>
['everytime it suits me' 'never' 'only to avoid hurting someone'
 'sometimes']
<class 'numpy.ndarray'>
['few hours a day' 'less than an hour a day' 'most of the day'
 'no time at all']
<class 'numpy.ndarray'>
['female' 'male']
<class 'numpy.ndarray'>
['left handed' 'right handed']
<class 'numpy.ndarray'>
['college/bachelor degree' 'currently a primary school pupil'
 'doctorate degree' 'masters degree' 'primary school' 'secondary school']
<class 'numpy.ndarray'>
['no' 'yes']
<class 'numpy.ndarray'>
['city' 'village']
<class 'numpy.ndarray'>
['block of flats' 'house/bungalow']
<class 'numpy.ndarray'>
   Music  Slow songs or fast songs  Dance  Folk  Country  Classical music  \
0    5.0                       3.0    2.0   1.0      2

After preprocessing, training and test data is being extracted.
Then feature selection is being performed with "SelectKBest" function where the relevant features would be taken into account. 
KFold is used to check how the 
After that I have used three classifiers to train by adding to the pipeline. 

1) Linear SVC
2) Random Forest
3) Multi Layer Perceptron

But Random Forest model gave better accuracy than the other two.
I have also used ADA boost library to find out the better accuracy of the model.


In [402]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier


young_Y=young["Spending on healthy eating"]
young_X=young.drop("Spending on healthy eating",axis=1)
X_train, X_test, y_train, y_test = train_test_split(young_X, young_Y, test_size=0.33)


MLP=MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
svm=LinearSVC()


maximum = 0.0


#print(p)
features = []
#     features.append(('scaler', StandardScaler()))
#features.append(('pca', PCA(n_components=10)))
features.append(('select_best', SelectKBest(chi2,k=p)))
feature_union = FeatureUnion(features)
# create pipeline
estimators = []
estimators.append(('feature_union', feature_union))

estimators.append(('rf_classifier' , RandomForestClassifier(criterion='gini', max_depth = 11, random_state=0)))

#estimators.append(('ADABoost', AdaBoostClassifier(base_estimator=model, n_estimators = 300)))
#estimators.append(('MLP', MLP))
model = Pipeline(estimators)

#adaboost_rf_classifier = AdaBoostClassifier(base_estimator=model, n_estimators = 300, random_state=0)
# evaluate pipeline
print("RandomForestClassifier")
model.fit(X_train,y_train)
results = (model.predict(X_test)==y_test).mean()
#print(results.mean())
seed = 7
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(model, X_test, y_test, cv=kfold)

print(results.max())

estimators2=[]
estimators2.append(('feature_union', feature_union))
estimators2.append(('linearSVC', LinearSVC()))
print("LinearSVC")
model2 = Pipeline(estimators2)
model2.fit(X_train,y_train)
#(model.predict(X_test)==y_test).mean()
#print(results.mean())
seed = 7
kfold = KFold(n_splits=10, random_state=seed)
results2 = cross_val_score(model2, X_test, y_test, cv=kfold)

print(results2.max())
estimators3=[]
estimators3.append(('feature_union', feature_union))
estimators3.append(('Logistic', LogisticRegression()))
print("LogisticRegression")
model3 = Pipeline(estimators3)
model3.fit(X_train,y_train)
#(model.predict(X_test)==y_test).mean()
#print(results.mean())
seed = 7
kfold = KFold(n_splits=10, random_state=seed)
results3 = cross_val_score(model3, X_test, y_test, cv=kfold)

print(results3.max())


RandomForestClassifier
0.484848484848
LinearSVC
0.411764705882
LogisticRegression
0.393939393939
