In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.offline import iplot

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")
pd.set_option('future.no_silent_downcasting', True)
pd.options.mode.copy_on_write = "warn"

In [6]:
def add_line(fig, x0 = 0, y0 = 0, x1 = 0, y1 = 0,
             line_color = "#00DFA2", font_color = "#3C486B",
             xposition = "right", text = "Text"):
    fig.add_shape(type='line',
                  x0 = x0,
                  y0 = y0,
                  x1 = x1,
                  y1 = y1 + 2,
                  line = {
                      "color" : line_color,
                      "width" : 3,
                      "dash" : "dashdot"
                  },
                  label={
                      "text" : f"\t{text}: {x1: 0.1f}\t".expandtabs(5),
                      "textposition": "end",
                      "yanchor" :"top",
                      "xanchor" :xposition,
                      "textangle" :0,
                      "font": {
                          "size": 14,
                          "color" :font_color,
                          "family" : "arial"




























                      },
                  }
                 )

In [3]:
def custome_layout(title_size = 28, hover_font_size = 16, showlegend = False):
    fig.update_layout(
    showlegend = showlegend,
    title = {
        "font" :{
            "size" :title_size,
            "family" : "tahoma"
        }
    },
    hoverlabel = {
        "bgcolor" :"#111",
        "font_size" : hover_font_size,
        "font_family" :"arial"
    }

)

In [7]:
df = pd.read_csv("/content/adult.csv")

In [8]:
fig = px.box(
    x = df["occupation"], y = df["income"],
    title= "income Vs.occupation",
    template="plotly_dark",
    labels={"x": "occupation", "y" :"income"}
)

custome_layout(hover_font_size=13)

iplot(fig)

In [9]:
fig = px.box(
    x = df["education"], y = df["income"],
    title= "income Vs.education",
    template="plotly_dark",
    labels={"x": "education", "y" :"income"}
)

custome_layout(hover_font_size=13)

iplot(fig)

In [10]:

salary_by_gender = df.groupby("gender")["hours-per-week"].mean().sort_values(ascending=False)
salary_by_gender.apply(lambda x: f"{x:,.2f}")

Unnamed: 0_level_0,hours-per-week
gender,Unnamed: 1_level_1
Male,42.41
Female,36.46


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28855 entries, 0 to 28854
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              28855 non-null  int64  
 1   workclass        28855 non-null  object 
 2   fnlwgt           28855 non-null  int64  
 3   education        28855 non-null  object 
 4   educational-num  28855 non-null  int64  
 5   marital-status   28855 non-null  object 
 6   occupation       28854 non-null  object 
 7   relationship     28854 non-null  object 
 8   race             28854 non-null  object 
 9   gender           28854 non-null  object 
 10  capital-gain     28854 non-null  float64
 11  capital-loss     28854 non-null  float64
 12  hours-per-week   28854 non-null  float64
 13  native-country   28854 non-null  object 
 14  income           28854 non-null  object 
dtypes: float64(3), int64(3), object(9)
memory usage: 3.3+ MB


In [12]:
df.sample(10, random_state=15)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
2702,31,Self-emp-not-inc,190650,Masters,14,Married-civ-spouse,Sales,Husband,Asian-Pac-Islander,Male,0.0,0.0,40.0,?,<=50K
17867,51,Private,71046,Some-college,10,Divorced,Exec-managerial,Unmarried,White,Male,0.0,0.0,45.0,Scotland,<=50K
22638,29,Private,225024,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
20455,17,?,114798,11th,7,Never-married,?,Own-child,White,Female,0.0,0.0,18.0,United-States,<=50K
28314,40,Private,213019,HS-grad,9,Divorced,Sales,Unmarried,White,Female,0.0,0.0,30.0,United-States,<=50K
12563,29,Private,118503,HS-grad,9,Divorced,Machine-op-inspct,Not-in-family,White,Male,0.0,0.0,50.0,United-States,<=50K
22667,37,Private,301568,12th,8,Never-married,Other-service,Own-child,White,Female,0.0,0.0,40.0,United-States,<=50K
26875,45,Private,117409,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
9061,20,Private,190916,HS-grad,9,Never-married,Machine-op-inspct,Own-child,White,Female,0.0,0.0,40.0,United-States,<=50K
23591,49,Private,147032,HS-grad,9,Married-civ-spouse,Other-service,Wife,Asian-Pac-Islander,Female,0.0,0.0,8.0,Philippines,<=50K


In [13]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,28855.0,38.634344,13.74288,17.0,28.0,37.0,48.0,90.0
fnlwgt,28855.0,189756.508473,105450.076329,13492.0,117527.0,178109.0,238410.0,1490400.0
educational-num,28855.0,10.073575,2.559413,1.0,9.0,10.0,12.0,16.0
capital-gain,28854.0,1070.827892,7463.025745,0.0,0.0,0.0,0.0,99999.0
capital-loss,28854.0,88.328273,403.260391,0.0,0.0,0.0,0.0,4356.0
hours-per-week,28854.0,40.442608,12.384877,1.0,40.0,40.0,45.0,99.0


In [14]:
df.isna().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,1
relationship,1
race,1
gender,1


In [15]:
df[df["age"].isna()]

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income


In [16]:
df.dropna(inplace=True)


df.isna().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,0
relationship,0
race,0
gender,0


In [17]:
df.duplicated().sum()

np.int64(18)

In [18]:
df[df.duplicated()].head(15)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
864,24,Private,194630,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0.0,0.0,35.0,United-States,<=50K
11189,37,Private,52870,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
11212,29,Private,36440,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0.0,0.0,40.0,United-States,<=50K
13848,30,Private,180317,Assoc-voc,11,Divorced,Machine-op-inspct,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
15960,18,Self-emp-inc,378036,12th,8,Never-married,Farming-fishing,Own-child,White,Male,0.0,0.0,10.0,United-States,<=50K
18476,39,Private,184659,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
19399,19,Private,130431,5th-6th,3,Never-married,Farming-fishing,Not-in-family,White,Male,0.0,0.0,36.0,Mexico,<=50K
19726,41,Private,116391,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
20606,25,Private,308144,Bachelors,13,Never-married,Craft-repair,Not-in-family,White,Male,0.0,0.0,40.0,Mexico,<=50K
21162,25,Private,308144,Bachelors,13,Never-married,Craft-repair,Not-in-family,White,Male,0.0,0.0,40.0,Mexico,<=50K


In [19]:

df.drop_duplicates(inplace=True)


df.reset_index(inplace=True, drop=True)

In [20]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [21]:
mean_of_age = df["age"].mean()
median_of_age = df["age"].median()

In [22]:
fig = px.box(
    y=df["age"],
    title= "Ages Distribution",
    template="plotly_dark",
    labels={"y" :"Age"},
)
custome_layout()

iplot(fig)

In [23]:
fig = px.histogram(
    df["age"],
    nbins=25,
    title= "Age Distribution",
    template="plotly_dark",
    labels={"value" :"Age"}
)

custome_layout()
fig.update_traces(
    textfont = {
        "size" : 20,
        "family" :"tahoma",
        "color": "#fff"
    },
    hovertemplate = "Age: %{x}<br>Frequency: %{y}",
    marker=dict(line=dict(color='#000', width=0.1))
)
add_line(fig=fig, x0=mean_of_age, y0=0, x1=mean_of_age, y1=30+2, line_color="#E97777",font_color="#E97777",
         text="Mean", xposition="left")


add_line(fig=fig, x0=median_of_age, y0=0, x1=median_of_age, y1=30+2, line_color="#FFE5F1",
         font_color="#fff", xposition="right", text="Median")

iplot(fig)

In [24]:
gender = df["gender"].value_counts(normalize=1) * 100
gender.apply(lambda x: f"{x:0.2f}%")

Unnamed: 0_level_0,proportion
gender,Unnamed: 1_level_1
Male,66.88%
Female,33.12%


In [25]:
fig = px.bar(data_frame = gender,
             x = gender.index,
             y = gender,
             color = gender.index,
             title = "Gender Frequency (PCT)",
             color_discrete_sequence=["#45FFCA", "#FF9B9B"],
             labels= {"index" :"Gender", "y": "Frequency in PCT(%)"},
             template="plotly_dark",
             text = gender.apply(lambda x: f"{x:0.0f}%"))

custome_layout()


fig.update_traces(
    textfont = {
        "size" : 16,
        "family" :"arial",
        "color": "#222"
    },
    hovertemplate = "Gender: %{x}<br>Percentage: %{y:0.1f}%",
)

iplot(fig)

In [26]:
education = df["education"].value_counts(normalize=1) * 100
education.apply(lambda x: f"{x:0.2f}%")

Unnamed: 0_level_0,proportion
education,Unnamed: 1_level_1
HS-grad,32.55%
Some-college,22.43%
Bachelors,16.35%
Masters,5.56%
Assoc-voc,4.13%
11th,3.77%
Assoc-acdm,3.21%
10th,2.83%
7th-8th,1.95%
Prof-school,1.63%


In [27]:
fig = px.bar(data_frame = education,
             x = education.index,
             y = education,
             color = education.index,
             title = "Education Frequency (PCT)",
             color_discrete_sequence=["#45FFCA", "#D09CFA", "#FF9B9B"],
             labels= {"index" :"Education", "y": "Frequency in PCT(%)"},
             template="plotly_dark",
             text = education.apply(lambda x: f"{x:0.0f}%"))

custome_layout()


fig.update_traces(
    textfont = {
        "size" : 16,
        "family" :"arial",
        "color": "#222"
    },
    hovertemplate = "Education: %{x}<br>Percentage: %{y:0.1f}%",
)
iplot(fig)

In [28]:
df["experience"] = df["age"] - df["educational-num"]
mean_of_experience = df["experience"].mean()
median_of_experience = df["experience"].median()

fig = px.box(
    y=df["experience"],
    title= "Experience Distribution",
    template="plotly_dark",
    labels={"y" :"Experience"},
)
custome_layout()
iplot(fig)

fig = px.histogram(
    df["experience"],
    nbins=25,
    title= "Experience Distribution",
    template="plotly_dark",
    labels={"value" :"Experience"}
)

custome_layout()
fig.update_traces(
    textfont = {
        "size" : 20,
        "family" :"tahoma",
        "color": "#fff"
    },
    hovertemplate = "Experience: %{x}<br>Frequency: %{y}",
    marker=dict(line=dict(color='#000', width=0.1))
)



iplot(fig)

In [29]:
fig = px.box(
    x = df["educational-num"], y = df["income"],
    title= "income Vs.educational-num",
    template="plotly_dark",
    labels={"x": "educational-num", "y" :"income"}
)

custome_layout(hover_font_size=13)

iplot(fig)

In [30]:
fig = px.box(
    y=df["hours-per-week"],
    title= "hours-per-week",
    template="plotly_dark",
    labels={"y" :"hours-per-week"},
)
custome_layout()

iplot(fig)

In [31]:
fig = px.box(
    x = df["occupation"], y = df["income"],
    title= "income Vs.occupation",
    template="plotly_dark",
    labels={"x": "occupation", "y" :"income"}
)

custome_layout(hover_font_size=13)

iplot(fig)

In [32]:
fig = px.box(
    x = df["education"], y = df["income"],
    title= "income Vs.education",
    template="plotly_dark",
    labels={"x": "education", "y" :"income"}
)

custome_layout(hover_font_size=13)

iplot(fig)

In [33]:

salary_by_gender = df.groupby("gender")["hours-per-week"].mean().sort_values(ascending=False)
salary_by_gender.apply(lambda x: f"{x:,.2f}")

Unnamed: 0_level_0,hours-per-week
gender,Unnamed: 1_level_1
Male,42.42
Female,36.47


In [34]:
loss_by_gender = df.groupby("gender")["capital-loss"].mean().sort_values(ascending=False)
loss_by_gender.apply(lambda x: f"{x:,.2f}")

Unnamed: 0_level_0,capital-loss
gender,Unnamed: 1_level_1
Male,102.67
Female,59.53


In [35]:


win_by_gender = df.groupby("gender")["capital-gain"].mean().sort_values(ascending=False)
win_by_gender.apply(lambda x: f"{x:,.2f}")

Unnamed: 0_level_0,capital-gain
gender,Unnamed: 1_level_1
Male,1306.27
Female,597.37


In [39]:
salary_by_gender = df.groupby("gender")["income_numeric"].mean().sort_values(ascending=False)
salary_by_gender.apply(lambda x: f"{x:,.2f}")

Unnamed: 0_level_0,income_numeric
gender,Unnamed: 1_level_1
Male,0.3
Female,0.11


In [40]:
fig = px.bar(data_frame = salary_by_gender,
             x = salary_by_gender.index,
             y = salary_by_gender,
             color = salary_by_gender.index,
             title = "AVG Salary By Gender👨👩",
             color_discrete_sequence=["#45FFCA", "#D09CFA", "#FF9B9B"],
             labels= {"index" :"Education", "y": "Frequency in PCT(%)"},
             template="plotly_dark",
             text_auto = "0.4s"
            )

custome_layout()


fig.update_traces(
    textfont = {
        "size" : 16,
        "family" :"arial",
        "color": "#222"
    },
    hovertemplate = "Gender: %{x}<br>Average Salary: $%{y:0.4s}",
)

iplot(fig)

In [41]:
salary_by_education = df.groupby("education")["income_numeric"].mean().sort_values(ascending=False)
salary_by_education.apply(lambda x: f"{x:,.2f}")

Unnamed: 0_level_0,income_numeric
education,Unnamed: 1_level_1
Prof-school,0.74
Doctorate,0.72
Masters,0.55
Bachelors,0.41
Assoc-acdm,0.26
Assoc-voc,0.24
Some-college,0.19
HS-grad,0.16
7th-8th,0.07
12th,0.07


In [42]:
fig = px.bar(data_frame = salary_by_education,
             x = salary_by_education.index,
             y = salary_by_education,
             color = salary_by_education.index,
             title = "AVG Salary Via Education Level👨👩",
             color_discrete_sequence=["#45FFCA", "#D09CFA", "#FF9B9B"],
             labels= {"index" :"Education", "y": "Frequency in PCT(%)"},
             template="plotly_dark",
             text_auto = "0.4s"
            )

custome_layout()


fig.update_traces(
    textfont = {
        "size" : 16,
        "family" :"arial",
        "color": "#222"
    },    hovertemplate = "Education Level: %{x}<br>Average Salary: $%{y:0.4s}",
)

iplot(fig)


In [44]:
def groupping_exp(exp):
    if exp >= 0 and exp <= 5:
        return "0-5 years"
    elif exp > 5 and exp <= 10:
        return "6-10 years"
    elif exp > 10 and exp <= 15:
        return "11-15 years"
    elif exp > 15 and exp <= 20:
        return "16-20 years"
    else:
        return "20+"

In [45]:
salary_by_exp = df.groupby(df["experience"].apply(groupping_exp))["income_numeric"].mean().sort_values(ascending=False)
salary_by_exp.apply(lambda x: f"${x:,.2f}")

Unnamed: 0_level_0,income_numeric
experience,Unnamed: 1_level_1
20+,$0.31
16-20 years,$0.18
11-15 years,$0.05
6-10 years,$0.01
0-5 years,$0.00


In [46]:
fig = px.bar(data_frame = salary_by_exp,
             x = salary_by_exp.index,
             y = salary_by_exp,
             color = salary_by_exp.index,
             title = "AVG Salary By Gender👨👩",
             color_discrete_sequence=["#45FFCA", "#D09CFA", "#FF9B9B", "#F875AA", "#3EDBF0"],
             labels= {"index" :"Education", "y": "Frequency in PCT(%)"},
             template="plotly_dark",
             text_auto = "0.4s"
            )

custome_layout()


fig.update_traces(
    textfont = {
        "size" : 16,
        "family" :"arial",
        "color": "#222"
    },
    hovertemplate = "Gender: %{x}<br>Average Salary: $%{y:0.4s}",
)
iplot(fig)

In [47]:
correlation = df.corr(numeric_only=True)

fig = px.imshow(
    correlation,
    template = "plotly_dark",
    text_auto = "0.2f",
    aspect=1,
    color_continuous_scale="orrd",
    title= "Correlations Between Data"
)

fig.update_layout(
    title = {
        "font" :{
            "size" : 28,
            "family" : "tahoma"
        }
    }
)
iplot(fig)

In [48]:
fig = px.scatter_matrix(
    df,
    dimensions=df.select_dtypes(include="number").columns,
    height=800,
    color="income_numeric",
    opacity=0.65,
    title= "Relationships Between Numerical Data",
    template="plotly_dark"

)

fig.update_layout(
    title = {
        "font" :{
            "size" : 28,
            "family" : "tahoma"
        }
    }
)
iplot(fig)

In [49]:
df_encoded = pd.get_dummies(df, columns=["education"], drop_first=True) *1
df_encoded.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,...,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,...,0,0,0,0,0,0,0,0,0,0
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,...,0,0,0,0,0,1,0,0,0,0
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,...,0,1,0,0,0,0,0,0,0,0
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,...,0,0,0,0,0,0,0,0,0,1
4,18,?,103497,10,Never-married,?,Own-child,White,Female,0.0,...,0,0,0,0,0,0,0,0,0,1


In [50]:
X = df_encoded.drop(columns=["income", "income_numeric", "gender"])
y = df_encoded["income_numeric"]

In [51]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,...,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,7688.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,18,?,103497,10,Never-married,?,Own-child,White,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90)

non_numeric_cols = X.select_dtypes(include=['object', 'category']).columns
X_test_encoded = pd.get_dummies(X_test, columns=non_numeric_cols, drop_first=True) * 1

In [53]:
df['income_numeric'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)
display(df.head())

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,experience,income_numeric
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K,18,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K,29,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K,16,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K,34,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K,8,0


In [54]:
salary_by_gender = df.groupby("gender")["income_numeric"].mean().sort_values(ascending=False)
salary_by_gender.apply(lambda x: f"{x:,.2f}")

Unnamed: 0_level_0,income_numeric
gender,Unnamed: 1_level_1
Male,0.3
Female,0.11


In [55]:
kf = KFold(n_splits=10, shuffle=True, random_state=30)

In [56]:
rf = RandomForestRegressor(n_estimators=500, random_state=11)

In [60]:

non_numeric_cols = X.select_dtypes(include=['object', 'category']).columns
print("Non-numeric columns to encode:", non_numeric_cols)


X_encoded = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True) * 1
X_encoded.head()

Non-numeric columns to encode: Index(['workclass', 'marital-status', 'occupation', 'relationship', 'race',
       'native-country'],
      dtype='object')


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,experience,education_11th,education_12th,education_1st-4th,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0.0,0.0,40.0,18,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0.0,0.0,50.0,29,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0.0,0.0,40.0,16,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,160323,10,7688.0,0.0,40.0,34,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,18,103497,10,0.0,0.0,30.0,8,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [61]:

rf.fit(X_encoded, y)

In [66]:


import numpy as np

missing_cols = set(X_encoded.columns) - set(X_test_encoded.columns)
for c in missing_cols:
    X_test_encoded[c] = 0

X_test_encoded = X_test_encoded[X_encoded.columns]


predicted_salary = np.round(rf.predict(X_test_encoded))

In [67]:
d = {
    "Actual_Salary" : y_test,
    "Predicted_Salary" : predicted_salary,
    "error": predicted_salary - y_test
}
predected_df = pd.DataFrame(d)
predected_df.head()

Unnamed: 0,Actual_Salary,Predicted_Salary,error
26203,0,0.0,0.0
8243,1,1.0,0.0
23356,0,0.0,0.0
26166,0,0.0,0.0
7445,0,0.0,0.0


In [68]:
score = r2_score(y_test, predicted_salary)*100
print(f"Model Score: {np.round(score, 2)}%")

Model Score: 99.81%


In [69]:
rmse = np.sqrt(mean_squared_error(y_test, predicted_salary))
print(f"Error Ratio: {rmse:.3f}")

Error Ratio: 0.019


In [70]:
fig = px.scatter(
    predected_df,
    x = "Actual_Salary",
    y = "Predicted_Salary",
    color = "error",
    opacity=0.8,
    title= "Predicted Vs. Actual",
    template="plotly_dark",
    trendline="ols"

)

fig.update_layout(
    title = {
        "font" :{
            "size" : 28,
            "family" : "tahoma"
        }
    }
)
iplot(fig)


In [71]:
pd.to_pickle(rf, "random_forest_regressor_salary_predictor_v1.pkl")

In [76]:


import numpy as np
score = rf.score(X_encoded, y)*100
print(f"Model Score: {np.round(score, 2)}%")

Model Score: 92.33%


In [77]:
df['income_numeric'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)
display(df.head())

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,experience,income_numeric
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K,18,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K,29,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K,16,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K,34,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K,8,0


In [79]:


import numpy as np
predicted_salary = np.round(rf.predict(X_test_encoded))

In [80]:
d = {
    "Actual_Salary" : y_test,
    "Predicted_Salary" : predicted_salary,
    "error": predicted_salary - y_test
}
predected_df = pd.DataFrame(d)
predected_df.head()

Unnamed: 0,Actual_Salary,Predicted_Salary,error
26203,0,0.0,0.0
8243,1,1.0,0.0
23356,0,0.0,0.0
26166,0,0.0,0.0
7445,0,0.0,0.0


In [81]:
import numpy as np
score = rf.score(X_encoded, y)*100
print(f"Model Score: {np.round(score, 2)}%")
missing_cols = set(X_encoded.columns) - set(X_test_encoded.columns)
for c in missing_cols:
    X_test_encoded[c] = 0
# Ensure the order of columns in the test set is the same as in the training set
X_test_encoded = X_test_encoded[X_encoded.columns]


predicted_salary = np.round(rf.predict(X_test_encoded))

Model Score: 92.33%


In [84]:


import numpy as np
scores = cross_val_score(rf, X_encoded, y, cv=kf)
print(f"Cross Validation Score: {np.mean(scores)*100:0.2f}%")

Cross Validation Score: 43.80%
