In [26]:
#importing library
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso,HuberRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

In [27]:
# Load the Korea Income and Welfare CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv('/kaggle/input/korea-income-and-welfare/Korea Income and Welfare.csv')
# Display the loaded frame as the cell output.
df

Unnamed: 0,id,year,wave,region,income,family_member,gender,year_born,education_level,marriage,religion,occupation,company_size,reason_none_worker
0,10101,2005,1,1,614.0,1,2,1936,2,2,2,,,8
1,10101,2011,7,1,896.0,1,2,1936,2,2,2,,,10
2,10101,2012,8,1,1310.0,1,2,1936,2,2,2,,,10
3,10101,2013,9,1,2208.0,1,2,1936,2,2,2,,,1
4,10101,2014,10,1,864.0,1,2,1936,2,2,2,,,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92852,98000701,2014,10,5,11600.0,6,1,1967,5,1,1,874,1,
92853,98000701,2015,11,5,8327.0,6,1,1967,5,1,1,874,1,
92854,98000701,2016,12,5,7931.0,6,1,1967,5,1,1,874,1,
92855,98000701,2017,13,5,8802.0,5,1,1967,5,1,1,874,1,


In [28]:
# Summarise the raw frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92857 entries, 0 to 92856
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  92857 non-null  int64  
 1   year                92857 non-null  int64  
 2   wave                92857 non-null  int64  
 3   region              92857 non-null  int64  
 4   income              92857 non-null  float64
 5   family_member       92857 non-null  int64  
 6   gender              92857 non-null  int64  
 7   year_born           92857 non-null  int64  
 8   education_level     92857 non-null  int64  
 9   marriage            92857 non-null  int64  
 10  religion            92857 non-null  int64  
 11  occupation          92857 non-null  object 
 12  company_size        92857 non-null  object 
 13  reason_none_worker  92857 non-null  object 
dtypes: float64(1), int64(10), object(3)
memory usage: 9.9+ MB


In [29]:
def onehot_encode(df,column,prefix):
    """Replace ``column`` in a frame with one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : label
        Name of the categorical column to expand.
    prefix : str
        Prefix passed to ``pd.get_dummies`` for the indicator column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``column`` removed and the indicator columns
        appended on the right. Every boolean column in the result is cast
        to integer.
    """
    indicators=pd.get_dummies(df[column],prefix=prefix)
    # Append the indicators on the right, then discard the source column.
    encoded=pd.concat([df,indicators],axis=1).drop(column,axis=1)
    # Collect boolean columns first, then cast each of them to 0/1 integers.
    bool_names=[name for name in encoded.columns if encoded[name].dtypes=='bool']
    for name in bool_names:
        encoded[name]=encoded[name].astype(int)
    return encoded

In [45]:
#Preprocessing the dataset
def preprocess_inputs(df,train_size=0.7,random_state=123):
    """Turn the raw survey frame into scaled train/test features and targets.

    Steps: drop ``id``, map single-space strings to NaN, one-hot encode the
    nominal columns, make ``company_size`` numeric with missing values set
    to 0, split into train/test, and standardise the features with a scaler
    fitted on the training split only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``id``, ``income``, ``company_size``
        and the nominal columns listed in ``nominal_features`` below.
    train_size : float, default 0.7
        Fraction of rows placed in the training split.
    random_state : int, default 123
        Seed for the shuffled split.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``; the feature frames are
        standardised and keep their original column names and row index.
    """
    #dropping the id column (drop returns a new frame, the caller's df is untouched)
    df=df.drop('id',axis=1)

    #blank strings mark missing values; np.nan is used because the np.NaN alias
    #no longer exists in NumPy 2.0
    df=df.replace(' ',np.nan)

    nominal_features=[('region','reg'),('marriage','mar'),('religion','rel'),('occupation','occ'),
                     ('reason_none_worker','rsn')]
    for column,prefix in nominal_features:
        df=onehot_encode(df,column,prefix=prefix)

    #company_size is read as text; cast it to float explicitly instead of relying on
    #implicit coercion inside the scaler, then fill missing values with zero
    df['company_size']=df['company_size'].astype(float).fillna(0)

    #splitting between target and features columns
    # NOTE(review): in the displayed training output `year` and `wave` carry identical
    # scaled values, which suggests they are redundant -- confirm before dropping one.
    y=df['income']
    x=df.drop('income',axis=1)

    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=train_size,random_state=random_state,shuffle=True)

    #fit the scaler on the training split only so test statistics do not leak into it
    scaler=StandardScaler()
    scaler.fit(x_train)

    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns,index=x_test.index)

    return x_train,x_test,y_train,y_test

In [46]:
# Run the preprocessing pipeline on the raw frame to obtain the scaled train/test features and the income targets.
x_train,x_test,y_train,y_test=preprocess_inputs(df)
# Display the scaled training features as the cell output.
x_train

Unnamed: 0,year,wave,family_member,gender,year_born,education_level,company_size,reg_1,reg_2,reg_3,...,rsn_11,rsn_2,rsn_3,rsn_4,rsn_5,rsn_6,rsn_7,rsn_8,rsn_9,rsn_99
34107,-1.128235,-1.128235,-0.374182,-0.649653,1.251899,0.290121,2.447824,-0.428696,-0.513361,2.180871,...,-0.051359,-0.008771,-0.022538,-0.086525,-0.05933,-0.107751,-0.112895,-0.218044,-0.172252,-0.029626
13407,-1.377433,-1.377433,1.170822,-0.649653,0.501272,0.888054,0.249484,2.332656,-0.513361,-0.458532,...,-0.051359,-0.008771,-0.022538,-0.086525,-0.05933,-0.107751,-0.112895,-0.218044,-0.172252,-0.029626
24021,0.366956,0.366956,-1.146684,1.539284,-1.562953,-1.503676,-0.692662,-0.428696,-0.513361,-0.458532,...,-0.051359,-0.008771,-0.022538,-0.086525,-0.05933,-0.107751,-0.112895,-0.218044,-0.172252,-0.029626
72605,0.117758,0.117758,-1.146684,1.539284,-1.375297,-1.503676,-0.692662,-0.428696,-0.513361,-0.458532,...,-0.051359,-0.008771,-0.022538,-0.086525,-0.05933,-0.107751,-0.112895,-0.218044,-0.172252,-0.029626
27945,-1.128235,-1.128235,1.170822,-0.649653,0.751481,0.888054,2.447824,-0.428696,1.947947,-0.458532,...,-0.051359,-0.008771,-0.022538,-0.086525,-0.05933,-0.107751,-0.112895,-0.218044,-0.172252,-0.029626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63206,-0.380639,-0.380639,0.398320,1.539284,0.939138,0.290121,-0.064565,-0.428696,-0.513361,-0.458532,...,-0.051359,-0.008771,-0.022538,-0.086525,-0.05933,-0.107751,-0.112895,-0.218044,-0.172252,-0.029626
61404,-1.128235,-1.128235,-0.374182,-0.649653,-0.437012,-0.905744,-0.378613,-0.428696,-0.513361,-0.458532,...,-0.051359,-0.008771,-0.022538,-0.086525,-0.05933,-0.107751,-0.112895,-0.218044,-0.172252,-0.029626
17730,0.366956,0.366956,-1.146684,-0.649653,1.877422,1.485986,2.447824,-0.428696,-0.513361,2.180871,...,-0.051359,-0.008771,-0.022538,-0.086525,-0.05933,-0.107751,-0.112895,-0.218044,-0.172252,-0.029626
28030,1.363750,1.363750,-0.374182,-0.649653,0.000854,-0.307811,-0.692662,-0.428696,1.947947,-0.458532,...,-0.051359,-0.008771,-0.022538,-0.086525,-0.05933,-0.107751,-0.112895,-0.218044,5.805441,-0.029626


# Training the Model

In [47]:
# Candidate regressors, keyed by the label printed during evaluation.
# random_state is pinned on the estimators that use randomness (LinearSVR,
# DecisionTreeRegressor) so that re-running the notebook reproduces the same scores.
models={'Linear Regression':LinearRegression(),
'L-2 Regularisation Linear Regression':Ridge(),
'L-1 Regularisation Linear Regression':Lasso(),
'Huber Regression':HuberRegressor(),
'Linear Kernel Support Vector Machine':LinearSVR(random_state=123),
'Decision Tree Regressor':DecisionTreeRegressor(random_state=123)}

In [52]:
# Fit every candidate on the training split and report its score on the test split.
# For regressors, .score() returns the coefficient of determination (R^2), not an
# accuracy: 1.0 is a perfect fit and the value can be arbitrarily negative.
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name)
    print('R^2 score on the test set:',model.score(x_test,y_test))

Linear Regression
The accuracy of the model -4.498881094129298e+17
L-2 Regularisation Linear Regression
The accuracy of the model 0.22284417865899742
L-1 Regularisation Linear Regression
The accuracy of the model 0.22289167250584052
Huber Regression
The accuracy of the model 0.21120901157184424
Linear Kernel Support Vector Machine
The accuracy of the model 0.20053571331099196
Decision Tree Regressor
The accuracy of the model 0.03962493662836497


In [41]:
# Total number of missing values across all columns of `x`.
# NOTE(review): no visible cell assigns `x` at top level (it only exists as a local inside
# preprocess_inputs) -- this presumably relies on leftover kernel state; confirm or remove.
x.isna().sum().sum()

0

In [35]:
# List the column labels of `x`.
# NOTE(review): no visible cell assigns `x` at top level (it only exists as a local inside
# preprocess_inputs) -- this presumably relies on leftover kernel state; confirm or remove.
x.columns

Index(['year', 'wave', 'income', 'family_member', 'gender', 'year_born',
       'education_level', 'company_size', 'reg_1', 'reg_2',
       ...
       'rsn_11', 'rsn_2', 'rsn_3', 'rsn_4', 'rsn_5', 'rsn_6', 'rsn_7', 'rsn_8',
       'rsn_9', 'rsn_99'],
      dtype='object', length=282)

In [None]:
# Distinct region codes in the raw data. Reads from `df` because no visible cell defines
# a top-level `x`, so the original `x['region']` would fail on a fresh kernel.
df['region'].unique()

In [None]:
# Count missing values per column of the raw data. Blank strings (' ') stand for missing
# entries in this dataset, so they are mapped to NaN before counting. Reads from `df`
# because no visible cell defines a top-level `x`.
df.replace(' ',np.nan).isna().sum()

In [None]:
# Distinct occupation codes in the raw data. Reads from `df` because no visible cell defines
# a top-level `x`, so the original `x['occupation']` would fail on a fresh kernel.
df['occupation'].unique()