**IMPORT LIBRARIES**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and the
# old names were removed in later releases, so pick whichever name this
# installation actually registers instead of hard-coding the legacy one.
_SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_SEABORN_STYLE)

from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action='ignore')


**IMPORT DATA**

In [None]:
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
creds, _ = default()

gc = gspread.authorize(creds)

# Read every cell of the first sheet of the 'adult' spreadsheet.
worksheet = gc.open('adult').sheet1
dataset = worksheet.get_all_values()

df = pd.DataFrame.from_records(dataset)

# The first record holds the column names: promote it to the header and keep
# the remaining records as data. `.copy()` detaches the slice so the column
# assignment below writes to an independent frame.
df.columns = df.iloc[0]
df = df.iloc[1:].copy()

# Every value arrives from the sheet as a string, so all columns start out as
# object dtype. Convert the numeric columns explicitly; otherwise later steps
# (e.g. pd.get_dummies) treat every distinct number as its own category.
# pd.to_numeric raises on an unparsable value, surfacing bad cells early.
NUMERIC_COLS = ['age', 'fnlwgt', 'educational-num',
                'capital-gain', 'capital-loss', 'hours-per-week']
df[NUMERIC_COLS] = df[NUMERIC_COLS].apply(pd.to_numeric)

# NOTE(review): some categorical cells contain '?' (see workclass/occupation in
# the preview) — presumably a missing-value marker; confirm how it should be handled.
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
1,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
2,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
3,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
4,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48838,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48839,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48840,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48841,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 1 to 48842
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  object
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  object
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  object
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  object
 11  capital-loss     48842 non-null  object
 12  hours-per-week   48842 non-null  object
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: object(15)
memory usage: 5.6+ MB


In [None]:
# Per-column summary statistics; for object-dtype columns pandas reports
# count / unique / top / freq instead of numeric moments.
df.describe()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
count,48842,48842,48842,48842,48842,48842,48842,48842,48842,48842,48842,48842,48842,48842,48842
unique,74,9,28523,16,16,7,15,6,5,2,123,99,96,42,2
top,36,Private,203488,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
freq,1348,33906,21,15784,15784,22379,6172,19716,41762,32650,44807,46560,22803,43832,37155


**SPLIT AND VISUALIZATION**

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# income class ratio the same in both splits; the fixed seed makes the split
# (and therefore every metric computed below) reproducible across runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['income'],
    random_state=0,
)

In [None]:
#cat_features = [c for c in df_train.columns if df_train[c].dtype == 'object']
#for c in cat_features:
    #sns.countplot(df_train[c], hue=df_train['income'])
    #plt.xticks(rotation=90)
    #plt.show()

In [None]:
# num_features = [c for c in df_train.columns if df_train[c].dtype != 'object']
# for c in num_features:
#     f, axis = plt.subplots(1,2, figsize=(20, 5))
#     sns.distplot(df_train[c], ax=axis[0], kde=True)
#     sns.boxplot(df_train['income'], df_train[c], ax=axis[1])
#     plt.show()

**ENCODING**

In [None]:
# Features: every column except the label.
x_train = df_train.drop(columns=['income'])
x_test = df_test.drop(columns=['income'])

# Target: 1 for '>50K', 0 for '<=50K', as a 1-D integer Series.
# Building it with an explicit comparison (rather than get_dummies + drop_first)
# keeps the target one-dimensional, gives the same 0/1 integer dtype on every
# pandas version, and guarantees train and test use the same positive class
# regardless of which labels each split happens to contain.
y_train = df_train['income'].eq('>50K').astype(int)
y_test = df_test['income'].eq('>50K').astype(int)

In [None]:
pip install --upgrade category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 KB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.5.1.post0


In [None]:
from category_encoders import MEstimateEncoder

# Columns replaced by an M-estimate (target) encoding; any remaining
# non-numeric columns are one-hot encoded further down.
TARGET_ENCODED_COLS = ['workclass', 'education', 'marital-status', 'occupation', 'native-country']
encoder = MEstimateEncoder(cols=TARGET_ENCODED_COLS, m=5.0)

# Fit on the training split only so no test labels leak into the encoding.
encoder.fit(x_train, y_train)

x_train_encoded = encoder.transform(x_train)
x_test_encoded = encoder.transform(x_test)

x_train_encoded = pd.get_dummies(x_train_encoded, drop_first=True)
x_test_encoded = pd.get_dummies(x_test_encoded, drop_first=True)

# get_dummies derives its columns from the values present in each frame, so
# train and test can end up with different columns (or a different order).
# Force the test matrix onto the training layout: categories unseen in test
# become all-zero columns, categories unseen in train are dropped.
x_test_encoded = x_test_encoded.reindex(columns=x_train_encoded.columns, fill_value=0)

**MODELLING**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report

In [None]:
# rf = RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=0)
# rf.fit(x_train_encoded, y_train)

# print(classification_report(y_test, rf.predict(x_test_encoded)))
# print('Train score : ', f1_score(y_train, rf.predict(x_train_encoded)))
# print('test score : ', f1_score(y_test, rf.predict(x_test_encoded)))

In [None]:
# Train and test matrices must share the exact same columns, in the same
# order, for the fitted model to be applicable to both.
assert list(x_train_encoded.columns) == list(x_test_encoded.columns), \
    "train/test feature columns differ; align them before fitting"

gbc = GradientBoostingClassifier(n_estimators=300, max_depth=5, random_state=0)
# Fit on the TRAINING split only. Fitting on the test split would make every
# "test" metric below a measure of memorisation rather than generalisation.
# np.ravel flattens the target so it is 1-D whether it is a Series or a
# single-column frame.
gbc.fit(x_train_encoded, np.ravel(y_train))

# Predict once per split and reuse the predictions for all metrics.
train_pred = gbc.predict(x_train_encoded)
test_pred = gbc.predict(x_test_encoded)

print(classification_report(y_test, test_pred))
print('Train score : ', f1_score(y_train, train_pred))
print('test score : ', f1_score(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92      7431
           1       0.83      0.63      0.72      2338

    accuracy                           0.88      9769
   macro avg       0.86      0.79      0.82      9769
weighted avg       0.88      0.88      0.87      9769



In [None]:
# adbc = AdaBoostClassifier(n_estimators=500, random_state=0)
# adbc.fit(x_train_encoded, y_train)

# print(classification_report(y_test, adbc.predict(x_test_encoded)))
# print('Train score : ', f1_score(y_train, adbc.predict(x_train_encoded)))
# print('test score : ', f1_score(y_test, adbc.predict(x_test_encoded)))