In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from IPython.display import display
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
dataset = pd.read_csv("tcd-ml-1920-group-income-train.csv")
testset = pd.read_csv("tcd-ml-1920-group-income-test.csv")

In [37]:
def preprocessing(dataset,testset):
    #Dropping features
    dataset=dataset.drop(['Profession'],axis=1)
    dataset=dataset.drop(['Country'],axis=1)
    dataset=dataset.drop(['Work Experience in Current Job [years]'],axis=1)
    dataset=dataset.drop(['Satisfation with employer'],axis=1)
    dataset=dataset.drop(['Size of City'],axis=1)
    dataset=dataset.drop(['Yearly Income in addition to Salary (e.g. Rental Income)'],axis=1)
    
    testset=testset.drop(['Profession'],axis=1)
    testset=testset.drop(['Country'],axis=1)
    testset=testset.drop(['Work Experience in Current Job [years]'],axis=1)
    testset=testset.drop(['Satisfation with employer'],axis=1)
    testset=testset.drop(['Size of City'],axis=1)
    testset=testset.drop(['Yearly Income in addition to Salary (e.g. Rental Income)'],axis=1)
         
   
    #Replacing missing values
    dataset['Year of Record'].fillna(dataset['Year of Record'].median(), inplace=True)
    dataset['Year of Record']=dataset['Year of Record'].replace(['#N/A'],'0')
    dataset['Housing Situation'].fillna(dataset['Housing Situation'], inplace=True)
    dataset['Housing Situation'] = dataset['Housing Situation'].str.replace(" ","")
    dataset['Crime Level in the City of Employement'].replace('', np.nan, inplace=True)
    dataset['Crime Level in the City of Employement'].fillna(dataset['Crime Level in the City of Employement'].median(),inplace=True)
    
    dataset['Gender'] = dataset['Gender'].replace(['0','nan' ], 'unknown') 
    dataset['Gender'] = dataset['Gender'].replace(['f'], 'female')
    dataset['University Degree'] = dataset['University Degree'].replace(['0','nan'], 'unknown')
    dataset['University Degree']=dataset['University Degree'].replace(['#N/A'],'0')
    dataset['Hair Color'] = dataset['Hair Color'].replace(['0','nan'], 'Unknown')
       
    
    testset['Year of Record'].fillna(testset['Year of Record'].median(), inplace=True)
    testset['Year of Record']=testset['Year of Record'].replace(['#N/A'],'0')
    testset['Housing Situation'].fillna(testset['Housing Situation'], inplace=True)
    testset['Housing Situation'] = testset['Housing Situation'].str.replace(" ","")
    testset['Crime Level in the City of Employement'].replace('', np.nan, inplace=True)
    testset['Crime Level in the City of Employement'].fillna(testset['Crime Level in the City of Employement'].median(),inplace=True)
    
    testset['Gender'] = testset['Gender'].replace(['0','nan' ], 'unknown') 
    testset['Gender'] = testset['Gender'].replace(['f'], 'female')
    testset['University Degree'] = testset['University Degree'].replace(['0','nan'], 'unknown')
    testset['University Degree']=testset['University Degree'].replace(['#N/A'],'0')
    testset['Hair Color'] = testset['Hair Color'].replace(['0','nan'], 'Unknown')
    
    return dataset,testset
    


In [38]:
def one_hot_encoding(dataset,testset):
#DATASET    
    dataset=pd.get_dummies(dataset,columns=['Gender'],prefix=['Gender'])
    dataset=pd.get_dummies(dataset,columns=['Housing Situation'],prefix=['Housing Situation'])
    dataset=pd.get_dummies(dataset,columns=['Hair Color'],prefix=['Hair Color']) 
    dataset=pd.get_dummies(dataset,columns=['Year of Record'],prefix=['Year of Record']) 
    dataset=pd.get_dummies(dataset,columns=['University Degree'],prefix=['University Degree'])

#TESTSET
    
    testset=pd.get_dummies(testset,columns=['Gender'],prefix=['Gender'])
    testset=pd.get_dummies(testset,columns=['Housing Situation'],prefix=['Housing Situation'])
    testset=pd.get_dummies(testset,columns=['Hair Color'],prefix=['Hair Color']) 
    testset=pd.get_dummies(testset,columns=['Year of Record'],prefix=['Year of Record']) 
    testset=pd.get_dummies(testset,columns=['University Degree'],prefix=['University Degree']) 
    return dataset,testset    

In [39]:
def model_predict(dataset,testset):
    
    Y=dataset['Total Yearly Income [EUR]'].astype(float)
    X=dataset.drop(['Total Yearly Income [EUR]'], axis=1)
    xtrain,xvalidate,ytrain,yvalidate=train_test_split(X,Y,test_size=0.2,random_state=0)
    x_test=testset.drop("Total Yearly Income [EUR]",axis=1)
    #print(xtrain)
    #print(Y)

#Applying model
    regressor=BayesianRidge()
    regressor.fit(xtrain,ytrain)
    y_predict=regressor.predict(xvalidate)
    result=regressor.predict(x_test)
    res=pd.DataFrame(x_test['Instance'])
    res['Total Yearly Income [EUR]']=result
    res.index=x_test.index
    res.to_csv("result.csv")
    rms=np.sqrt(mean_squared_error(yvalidate,y_predict))
    print("rmes is "+ str(rms))


In [40]:
def run_rmse(dataset,testset):
    print("Preprocessing Dataset and TestSet")
    dataset,testset=preprocessing(dataset,testset)
    print("One hot encoding")
    dataset,testset=one_hot_encoding(dataset,testset)
    print("Using predict model")
    model_predict(dataset,testset)
    
if __name__ == '__main__':
    run_rmse(dataset,testset)    

Preprocessing Dataset and TestSet
One hot encoding
Using predict model
rmes is 69108.67368154944
