In [179]:
#importing library
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [180]:
df=pd.read_csv('/kaggle/input/food-preferences/Food_Preference.csv')
#showing the data
df

Unnamed: 0,Timestamp,Participant_ID,Gender,Nationality,Age,Food,Juice,Dessert
0,2019/05/07 2:59:13 PM GMT+8,FPS001,Male,Indian,24,Traditional food,Fresh Juice,Maybe
1,2019/05/07 2:59:45 PM GMT+8,FPS002,Female,Indian,22,Western Food,Carbonated drinks,Yes
2,2019/05/07 3:00:05 PM GMT+8,FPS003,Male,Indian,31,Western Food,Fresh Juice,Maybe
3,2019/05/07 3:00:11 PM GMT+8,FPS004,Female,Indian,25,Traditional food,Fresh Juice,Maybe
4,2019/05/07 3:02:50 PM GMT+8,FPS005,Male,Indian,27,Traditional food,Fresh Juice,Maybe
...,...,...,...,...,...,...,...,...
283,2019/05/10 9:24:00 AM GMT+8,FPS284,Male,Indian,27,Western Food,Fresh Juice,Yes
284,2019/05/10 9:32:54 AM GMT+8,FPS285,Male,Indian,24,Traditional food,Fresh Juice,Yes
285,2019/05/10 12:09:17 PM GMT+8,FPS286,Male,Indian,25,Traditional food,Fresh Juice,Yes
286,2019/05/10 12:52:17 PM GMT+8,FPS287,Male,Indian,27,Traditional food,Fresh Juice,Yes


In [181]:
#dropping the unnecessary columnsZ
df.drop(['Timestamp','Participant_ID'],axis=1,inplace=True)

In [182]:
df

Unnamed: 0,Gender,Nationality,Age,Food,Juice,Dessert
0,Male,Indian,24,Traditional food,Fresh Juice,Maybe
1,Female,Indian,22,Western Food,Carbonated drinks,Yes
2,Male,Indian,31,Western Food,Fresh Juice,Maybe
3,Female,Indian,25,Traditional food,Fresh Juice,Maybe
4,Male,Indian,27,Traditional food,Fresh Juice,Maybe
...,...,...,...,...,...,...
283,Male,Indian,27,Western Food,Fresh Juice,Yes
284,Male,Indian,24,Traditional food,Fresh Juice,Yes
285,Male,Indian,25,Traditional food,Fresh Juice,Yes
286,Male,Indian,27,Traditional food,Fresh Juice,Yes


In [183]:
#getting information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Gender       284 non-null    object
 1   Nationality  288 non-null    object
 2   Age          288 non-null    int64 
 3   Food         288 non-null    object
 4   Juice        288 non-null    object
 5   Dessert      288 non-null    object
dtypes: int64(1), object(5)
memory usage: 13.6+ KB


In [184]:
#Dropping the missing valuesZ
df.dropna(axis=0,inplace=True)


In [185]:
df.isna().sum()

Gender         0
Nationality    0
Age            0
Food           0
Juice          0
Dessert        0
dtype: int64

In [186]:
#resetting index
df.reset_index(drop=True,inplace=True)

In [187]:
df

Unnamed: 0,Gender,Nationality,Age,Food,Juice,Dessert
0,Male,Indian,24,Traditional food,Fresh Juice,Maybe
1,Female,Indian,22,Western Food,Carbonated drinks,Yes
2,Male,Indian,31,Western Food,Fresh Juice,Maybe
3,Female,Indian,25,Traditional food,Fresh Juice,Maybe
4,Male,Indian,27,Traditional food,Fresh Juice,Maybe
...,...,...,...,...,...,...
279,Male,Indian,27,Western Food,Fresh Juice,Yes
280,Male,Indian,24,Traditional food,Fresh Juice,Yes
281,Male,Indian,25,Traditional food,Fresh Juice,Yes
282,Male,Indian,27,Traditional food,Fresh Juice,Yes


In [188]:
df['Age']

0      24
1      22
2      31
3      25
4      27
       ..
279    27
280    24
281    25
282    27
283    27
Name: Age, Length: 284, dtype: int64

In [189]:
#cutting the column into 2 different parts
df['Age']=pd.qcut(df['Age'],q=2,labels=[0,1])

In [190]:
age_bins

0      0
1      0
2      1
3      0
4      0
      ..
279    0
280    0
281    0
282    0
283    0
Name: Age, Length: 284, dtype: category
Categories (2, int64): [0 < 1]

In [191]:
pd.concat([df['Age'],age_bins],axis=1)

Unnamed: 0,Age,Age.1
0,0,0
1,0,0
2,1,1
3,0,0
4,0,0
...,...,...
279,0,0
280,0,0
281,0,0
282,0,0


# Encoding

In [192]:
categorical_features=['Gender','Nationality','Food','Juice','Dessert']

In [193]:
#creating a function
def get_uniques(df,columns):
    return {column: list(df[column].unique()) for column in columns}

In [194]:
get_uniques(df,categorical_features)

{'Gender': ['Male', 'Female'],
 'Nationality': ['Indian',
  'Pakistani ',
  'Tanzanian',
  'Indonesia',
  'Pakistan',
  'Maldivian ',
  'MY',
  'Malaysian',
  'Malaysian ',
  'Indonesian ',
  'Maldivian',
  'MALAYSIAN',
  'Malaysia ',
  'Pakistani',
  'Canadian',
  'Nigerian ',
  'Algerian ',
  'Korean ',
  'Seychellois',
  'Indonesain',
  'Indonesian',
  'Malaysia',
  'Japan',
  'China',
  'Mauritian',
  'Yemen'],
 'Food': ['Traditional food', 'Western Food'],
 'Juice': ['Fresh Juice', 'Carbonated drinks'],
 'Dessert': ['Maybe', 'Yes', 'No']}

In [195]:
binary_features=['Gender','Food','Juice']
ordinal_features=['Dessert']
nominal_features=['Nationality']

In [196]:
#creating a function for binary encoding
def binary_encode(df,column,positive_label):
    df=df.copy()
    df[column]=df[column].apply(lambda x:1 if x==positive_label else 0)
    return df

In [197]:
#ordinal encode
def ordinal_encode(df,column,ordering):
    df=df.copy()
    df[column]=df[column].apply(lambda x:ordering.index(x))
    return df

In [198]:
#onehot_encode
def onehot_encode(df,column):
    df=df.copy()
    dummies=pd.get_dummies(df[column])
    df=pd.concat([df,dummies],axis=1)
    df=df.drop(column,axis=1)
    return df

In [199]:
df=binary_encode(df,'Gender','Male')
df=binary_encode(df,'Food','Traditional food')
df=binary_encode(df,'Juice','Fresh Juice')

In [200]:
df

Unnamed: 0,Gender,Nationality,Age,Food,Juice,Dessert
0,1,Indian,0,1,1,Maybe
1,0,Indian,0,0,0,Yes
2,1,Indian,1,0,1,Maybe
3,0,Indian,0,1,1,Maybe
4,1,Indian,0,1,1,Maybe
...,...,...,...,...,...,...
279,1,Indian,0,0,1,Yes
280,1,Indian,0,1,1,Yes
281,1,Indian,0,1,1,Yes
282,1,Indian,0,1,1,Yes


In [201]:
dessert_ordering=['No','Maybe','Yes']
df=ordinal_encode(df,'Dessert',dessert_ordering)


In [202]:
df

Unnamed: 0,Gender,Nationality,Age,Food,Juice,Dessert
0,1,Indian,0,1,1,1
1,0,Indian,0,0,0,2
2,1,Indian,1,0,1,1
3,0,Indian,0,1,1,1
4,1,Indian,0,1,1,1
...,...,...,...,...,...,...
279,1,Indian,0,0,1,2
280,1,Indian,0,1,1,2
281,1,Indian,0,1,1,2
282,1,Indian,0,1,1,2


In [203]:
df=onehot_encode(df,'Nationality')

In [204]:
df

Unnamed: 0,Gender,Age,Food,Juice,Dessert,Algerian,Canadian,China,Indian,Indonesain,...,Maldivian,Maldivian.1,Mauritian,Nigerian,Pakistan,Pakistani,Pakistani.1,Seychellois,Tanzanian,Yemen
0,1,0,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,1,0,0,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
280,1,0,1,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
281,1,0,1,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
282,1,0,1,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [205]:
#Scaling and Spitting
y=df['Age']
x=df.drop('Age',axis=1)

In [208]:
scaler=MinMaxScaler()
x=scaler.fit_transform(x)

In [209]:
x

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [210]:
train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)

In [211]:
#Training
model=LogisticRegression()
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.6627906976744186