## Import Requirements and Data


In [690]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [691]:
import pandas as pd
pd.set_option('display.max_columns',100)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

#Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import ExtraTreeRegressor, DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.preprocessing import normalize,scale

# Multi Class Classification
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,precision_score,recall_score,f1_score

In [692]:
def regression_(x,y):
    kn=KNeighborsRegressor()
    et=ExtraTreeRegressor()
    gb=GradientBoostingRegressor()
    dt=DecisionTreeRegressor()
    xgbr=XGBRegressor()
       
    algos=[kn,et,gb,dt,xgbr]
    algos_names=['KNeighbors','ExtraTree','GradientBoosting','DecisionTree','XGBRegressor']
    
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2,random_state=13)
    
    r_score=[]
    mse=[]
    mae=[]
    
    result=pd.DataFrame(columns=['R_square','MSE','MAE'],index=algos_names)
    
    for algo in algos:
        pred=algo.fit(x_train,y_train).predict(x_test)
        r_score.append(r2_score(y_test,pred))
        mse.append(mean_squared_error(y_test,pred)**.5)
        mae.append(mean_absolute_error(y_test,pred))
    
    result.R_square=r_score
    result.MSE=mse
    result.MAE=mae
    
    return result.sort_values('R_square',ascending=False)

In [693]:
def classification_(X,y): 
    xgbc=XGBClassifier()
    b=BernoulliNB()
    k=KNeighborsClassifier()
    d=DecisionTreeClassifier()
    gbc=GradientBoostingClassifier()
    mn=MultinomialNB()
    rf=RandomForestClassifier()
    
    algos=[xgbc,b,k,d,gbc,mn,rf]
    algos_name=['XGBClassifier','Bernoulli','KNeighbors','DecisionTree','GradientBoosting','MultinominalNB','RandomForest']

    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=101)

    accuracy = []
    precision = []
    recall = []
    f1 = []
   
    result=pd.DataFrame(columns=['AccuracyScore','PrecisionScore','RecallScore','f1_Score'],index=algos_name)
    
    for i in algos:
        predict=i.fit(X_train,y_train).predict(X_test)
        accuracy.append(accuracy_score(y_test,predict))
        precision.append(precision_score(y_test,predict,average='weighted'))
        recall.append(recall_score(y_test,predict,average='weighted'))
        f1.append(f1_score(y_test,predict,average='weighted'))
      
    result.AccuracyScore=accuracy
    result.PrecisionScore=precision
    result.RecallScore=recall
    result.f1_Score=f1
    
    return result.sort_values('AccuracyScore',ascending=False)

In [694]:
df = pd.read_csv('/content/drive/Othercomputers/My Laptop/0 Yaz Kampı/PBL1/11 - Three generations seeking romance - Regression & Classification/profiles.csv')

## EDA

In [695]:
df.dropna(how='all', inplace=True) # tum sutunlari nan olan satirlari sildik
df.head(5)

Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,ethnicity,height,income,job,last_online,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
0,22.0,a little extra,strictly anything,socially,never,working on college/university,about me:<br />\n<br />\ni would love to think...,currently working as an international agent fo...,making people laugh.<br />\nranting about a go...,"the way i look. i am a six foot half asian, ha...","books:<br />\nabsurdistan, the republic, of mi...",food.<br />\nwater.<br />\ncell phone.<br />\n...,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet!<br />\nyou...,"asian, white",75.0,-1.0,transportation,2012-06-28-20-30,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single
1,35.0,average,mostly other,often,sometimes,working on space camp,i am a chef: this is what that means.<br />\n1...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories.<br /...,,,i am very open and will share just about anyth...,,white,70.0,80000.0,hospitality / travel,2012-06-29-21-41,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
2,38.0,thin,anything,socially,,graduated from masters program,"i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,okay this is where the cultural matrix gets so...,movement<br />\nconversation<br />\ncreation<b...,,viewing. listening. dancing. talking. drinking...,"when i was five years old, i was known as ""the...","you are bright, open, intense, silly, ironic, ...",,68.0,-1.0,,2012-06-27-09-10,"san francisco, california",,straight,has cats,,m,pisces but it doesn&rsquo;t matter,no,"english, french, c++",available
3,23.0,thin,vegetarian,socially,,working on college/university,i work in a library and go to school. . .,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,"bataille, celine, beckett. . .<br />\nlynch, j...",,cats and german philosophy,,,you feel so inclined.,white,71.0,20000.0,student,2012-06-28-14-22,"berkeley, california",doesn&rsquo;t want kids,straight,likes cats,,m,pisces,no,"english, german (poorly)",single
4,29.0,athletic,,socially,never,graduated from college/university,hey how's it going? currently vague on the pro...,work work work work + play,creating imagery to look at:<br />\nhttp://bag...,i smile a lot and my inquisitive nature,"music: bands, rappers, musicians<br />\nat the...",,,,,,"asian, black, other",66.0,-1.0,artistic / musical / writer,2012-06-27-21-26,"san francisco, california",,straight,likes dogs and likes cats,,m,aquarius,no,english,single


In [696]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9514 entries, 0 to 9513
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          9514 non-null   float64
 1   body_type    8666 non-null   object 
 2   diet         5761 non-null   object 
 3   drinks       9012 non-null   object 
 4   drugs        7215 non-null   object 
 5   education    8459 non-null   object 
 6   essay0       8667 non-null   object 
 7   essay1       8347 non-null   object 
 8   essay2       8049 non-null   object 
 9   essay3       7690 non-null   object 
 10  essay4       7879 non-null   object 
 11  essay5       7814 non-null   object 
 12  essay6       7352 non-null   object 
 13  essay7       7553 non-null   object 
 14  essay8       6361 non-null   object 
 15  essay9       7554 non-null   object 
 16  ethnicity    8565 non-null   object 
 17  height       9514 non-null   float64
 18  income       9514 non-null   float64
 19  job   

In [697]:
df.nunique() # essay lerin hepsi bambaska dummy uygulamak zor silinir/ #last_online da ihtiyacimiz yok

age              53
body_type        12
diet             18
drinks            6
drugs             3
education        31
essay0         8664
essay1         8278
essay2         7921
essay3         7148
essay4         7872
essay5         7808
essay6         7112
essay7         7380
essay8         6290
essay9         7377
ethnicity       112
height           34
income           13
job              21
last_online    6763
location         89
offspring        15
orientation       3
pets             15
religion         45
sex               2
sign             48
smokes            5
speaks         1794
status            4
dtype: int64

In [698]:
drop_cols = df.nunique()[df.nunique()>2000].index.values # 2000 den fazla nunuqiue e sahip olan sutunlar
df.drop(columns=drop_cols, inplace=True)

In [699]:
# Bulunulan yas ve generasyon: ethnicity, sign ile alakali degildir
df.drop(columns=['ethnicity', 'sign'], inplace=True)
# location denendi regresyona etkisi yok
df.drop(columns='location', inplace=True)

## Feature Engineering

In [700]:
df[df.age>70] # 70 den buyuk 1 kisi var silinmeli
df.drop(df[df.age>70].index, inplace=True)

In [701]:
# classiication icin generation sutunu olusturulur
# 0: Millenial, 1 : Gen X-er, 3 : Boomers
df.loc[df[df.age<=70].index, 'generation'] = 'Boomers' 
df.loc[df[df.age<=47].index, 'generation'] = 'Gen X-er' 
df.loc[df[df.age<=32].index, 'generation'] = 'Millennial' 

In [702]:
df["religion"].str.split()[0]

['agnosticism', 'and', 'very', 'serious', 'about', 'it']

In [703]:
df["religion"].str.split().str[0]

0        agnosticism
1        agnosticism
2                NaN
3                NaN
4                NaN
            ...     
9509    christianity
9510         atheism
9511         atheism
9512             NaN
9513             NaN
Name: religion, Length: 9513, dtype: object

In [704]:
# religion lari bulma
df["religion"]=df["religion"].str.split().str[0]

In [705]:
# income da -1 olanlar aslinda nan
df['income'] = df.income.apply(lambda x: np.nan if x==-1.0 else x)

In [706]:
# konusulan dil sayisi
df['speaks'] = df.speaks.apply(lambda x: len(x.split(',')) if isinstance(x, str) else np.nan)

In [707]:
# nan degerlere median atadik
df['speaks'].fillna(df.speaks.median(), inplace=True)

In [708]:
# hayvana sahip mi degil mi
# hata vermemsi icin sadece dolu olan datalarda yaptik
df['has_animals'] = df.pets.str.contains('has') # hayvan varsa yes yoksa no

In [709]:
# cocuk sahibi mi?
df['has_kids'] = df.offspring.str.contains('has')

In [710]:
# diet te category azaltma
df['diet']=df.diet.replace({'mostly':'', 'strictly':''}, regex=True).str.strip()

In [711]:
df.offspring.str.contains('does', na=True) # contatins var mi demek

0       True
1       True
2       True
3       True
4       True
        ... 
9509    True
9510    True
9511    True
9512    True
9513    True
Name: offspring, Length: 9513, dtype: bool

In [712]:
# np.select([cond1, cond2], [exp1, exp2], default=exp3)
df['offspring'] = df.offspring.str.contains('does', na=True)
# offspring te doess olan ve na olan True degerini alir olmayan False

In [713]:
df.head(5)

Unnamed: 0,age,body_type,diet,drinks,drugs,education,height,income,job,offspring,orientation,pets,religion,sex,smokes,speaks,status,generation,has_animals,has_kids
0,22.0,a little extra,anything,socially,never,working on college/university,75.0,,transportation,True,straight,likes dogs and likes cats,agnosticism,m,sometimes,1.0,single,Millennial,False,False
1,35.0,average,other,often,sometimes,working on space camp,70.0,80000.0,hospitality / travel,True,straight,likes dogs and likes cats,agnosticism,m,no,3.0,single,Gen X-er,False,False
2,38.0,thin,anything,socially,,graduated from masters program,68.0,,,True,straight,has cats,,m,no,3.0,available,Gen X-er,True,
3,23.0,thin,vegetarian,socially,,working on college/university,71.0,20000.0,student,True,straight,likes cats,,m,no,2.0,single,Millennial,False,False
4,29.0,athletic,,socially,never,graduated from college/university,66.0,,artistic / musical / writer,True,straight,likes dogs and likes cats,,m,no,1.0,single,Millennial,False,


In [714]:
df.nunique()

age            52
body_type      12
diet            6
drinks          6
drugs           3
education      31
height         34
income         12
job            21
offspring       2
orientation     3
pets           15
religion        9
sex             2
smokes          5
speaks          5
status          4
generation      3
has_animals     2
has_kids        2
dtype: int64

In [715]:
df.isnull().mean(axis=0)

age            0.000000
body_type      0.089036
diet           0.394408
drinks         0.052665
drugs          0.241564
education      0.110796
height         0.000000
income         0.809419
job            0.139809
offspring      0.000000
orientation    0.000000
pets           0.332282
religion       0.341848
sex            0.000000
smokes         0.096394
speaks         0.000000
status         0.000000
generation     0.000000
has_animals    0.332282
has_kids       0.591506
dtype: float64

In [716]:
df.drop(columns=['income'], inplace=True) # cogu bos olan verileri sildik

In [717]:
df.sample(5)

Unnamed: 0,age,body_type,diet,drinks,drugs,education,height,job,offspring,orientation,pets,religion,sex,smokes,speaks,status,generation,has_animals,has_kids
2871,21.0,a little extra,anything,socially,,working on college/university,64.0,entertainment / media,True,straight,likes dogs and likes cats,catholicism,f,sometimes,2.0,single,Millennial,False,
8898,23.0,skinny,vegetarian,socially,never,graduated from college/university,69.0,other,True,straight,likes dogs and has cats,,f,no,3.0,single,Millennial,True,
9184,33.0,,,socially,never,graduated from law school,73.0,political / government,True,straight,,catholicism,m,no,3.0,single,Gen X-er,,
3437,33.0,rather not say,anything,rarely,sometimes,graduated from high school,65.0,rather not say,True,straight,likes dogs and has cats,christianity,f,yes,2.0,single,Gen X-er,True,False
3345,26.0,athletic,,socially,never,graduated from college/university,67.0,science / tech / engineering,True,straight,,christianity,m,no,2.0,single,Millennial,,


In [718]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9513 entries, 0 to 9513
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          9513 non-null   float64
 1   body_type    8666 non-null   object 
 2   diet         5761 non-null   object 
 3   drinks       9012 non-null   object 
 4   drugs        7215 non-null   object 
 5   education    8459 non-null   object 
 6   height       9513 non-null   float64
 7   job          8183 non-null   object 
 8   offspring    9513 non-null   bool   
 9   orientation  9513 non-null   object 
 10  pets         6352 non-null   object 
 11  religion     6261 non-null   object 
 12  sex          9513 non-null   object 
 13  smokes       8596 non-null   object 
 14  speaks       9513 non-null   float64
 15  status       9513 non-null   object 
 16  generation   9513 non-null   object 
 17  has_animals  6352 non-null   object 
 18  has_kids     3886 non-null   object 
dtypes: boo

## Predict Age

In [719]:
y1=df.age
x1=df.drop(columns=['age', 'generation'])
x1=pd.get_dummies(x1, drop_first=True)
regression_(x1,y1)



Unnamed: 0,R_square,MSE,MAE
XGBRegressor,0.8614,3.502847,2.87144
GradientBoosting,0.861396,3.502891,2.870733
DecisionTree,0.721762,4.96304,3.86784
ExtraTree,0.718748,4.989848,3.852864
KNeighbors,0.500169,6.651984,4.31876


## Predict Generation

In [720]:
y2=df.generation
x2=df.drop(columns=['age','generation'])
x2=pd.get_dummies(x2, drop_first=True)
classification_(x2,y2)

Unnamed: 0,AccuracyScore,PrecisionScore,RecallScore,f1_Score
GradientBoosting,0.675775,0.63392,0.675775,0.622931
XGBClassifier,0.67525,0.633025,0.67525,0.617137
MultinominalNB,0.671046,0.640537,0.671046,0.646233
Bernoulli,0.659485,0.628803,0.659485,0.635558
RandomForest,0.658434,0.618991,0.658434,0.62564
KNeighbors,0.602733,0.568235,0.602733,0.578409
DecisionTree,0.572254,0.576694,0.572254,0.574405
