In [34]:
import math
import pandas as pd
import numpy as np
from sklearn import linear_model

In [35]:
# Load the home-price dataset and preview its first rows.
homeprices_csv = 'data/homeprices.csv'
df = pd.read_csv(homeprices_csv)
df.head()

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [36]:
# Fill the missing bedroom counts with the column median, rounded down to a
# whole number, then display the full frame.
bedrooms = df['bedrooms']
median_bedroom = math.floor(bedrooms.median())
df['bedrooms'] = bedrooms.fillna(median_bedroom)
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [37]:
# Fit a linear regression that models price from area, bedrooms and age.
feature_columns = ['area', 'bedrooms', 'age']
reg_model = linear_model.LinearRegression()
reg_model.fit(df[feature_columns], df['price'])

In [38]:
# Fitted coefficients, one per feature in the order passed to fit(): area, bedrooms, age.
reg_model.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [39]:
# Fitted intercept: the constant term added to the weighted sum of the features.
reg_model.intercept_

221323.00186540408

In [40]:
# Predict the price for area=3000, bedrooms=3, age=40.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column names. A bare nested list makes scikit-learn warn that
# "X does not have valid feature names" and silently relies on positional order.
reg_model.predict(pd.DataFrame({'area': [3000], 'bedrooms': [3], 'age': [40]}))



array([498408.25158031])

In [41]:
# Reproduce the prediction for area=3000, bedrooms=3, age=40 by hand:
# multiply each feature value by its coefficient, then add the intercept.
area_term = 3000 * 112.06244194
bedrooms_term = 3 * 23388.88007794
age_term = 40 * -3231.71790863
intercept = 221323.00186540408
area_term + bedrooms_term + age_term + intercept

498408.2515740241

In [42]:
# Predict the price for area=2500, bedrooms=4, age=5.
# The query is passed as a DataFrame whose columns match the ones used in fit();
# a bare nested list triggers scikit-learn's "X does not have valid feature
# names" warning and depends on the values being in the right positional order.
reg_model.predict(pd.DataFrame({'area': [2500], 'bedrooms': [4], 'age': [5]}))



array([578876.03748933])

# Exercise

The exercise folder (at the same level as this notebook on GitHub) contains hiring.csv. This file holds hiring statistics for a firm: each candidate's experience, written test score, and personal interview score. Based on these 3 factors, HR decides the salary. Given this data, build a machine learning model for the HR department that can help them decide salaries for future candidates. Use the model to predict salaries for the following candidates:

2 yr experience, 9 test score, 6 interview score

12 yr experience, 10 test score, 10 interview score

In [43]:
# Load the hiring dataset for the exercise and display every row.
hiring_csv = 'data/hiring.csv'
df = pd.read_csv(hiring_csv)
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [44]:
# Shorten the column headers by dropping the "(out of 10)" and "($)" suffixes.
column_aliases = {
    'salary($)': 'salary',
    'interview_score(out of 10)': 'interview_score',
    'test_score(out of 10)': 'test_score',
}
df.rename(columns=column_aliases, inplace=True)
df

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [45]:
# Replace missing experience values with the word 'zero' so that every entry
# is a number word that can be converted to an integer afterwards.
from word2number import w2n  # needed by the conversion cell that follows

# Assign the filled column back instead of calling fillna(inplace=True) on
# df.experience: the chained in-place form acts on an intermediate Series,
# raises a FutureWarning on recent pandas, and leaves df unchanged under
# Copy-on-Write.
df['experience'] = df['experience'].fillna('zero')
df.head()

Unnamed: 0,experience,test_score,interview_score,salary
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [46]:
# Convert the number words in 'experience' (e.g. 'five') into integers.
# NOTE(review): re-running this cell after the conversion presumably fails,
# because the column no longer holds strings — confirm against word2number.
experience_as_numbers = df['experience'].apply(w2n.word_to_num)
df['experience'] = experience_as_numbers
df.head()

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000


In [47]:
# Replace the missing test score with the median of the column.
# The result is assigned back rather than using fillna(inplace=True) on
# df.test_score: the chained in-place form works on an intermediate Series and
# does not update df when pandas Copy-on-Write is active.
df['test_score'] = df['test_score'].fillna(df['test_score'].median())
df

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


In [48]:
# Fit a linear regression that models salary from experience, test score and
# interview score.
feature_columns = ['experience', 'test_score', 'interview_score']
reg_model = linear_model.LinearRegression()
reg_model.fit(df[feature_columns], df['salary'])

In [49]:
# Fitted coefficients, in the order passed to fit(): experience, test_score, interview_score.
reg_model.coef_

array([2812.95487627, 1845.70596798, 2205.24017467])

In [50]:
# Fitted intercept of the salary model (the constant term of the linear equation).
reg_model.intercept_

17737.263464337695

In [51]:
# Exercise candidate 1: experience=2, test_score=9, interview_score=6.
# The query is a DataFrame with the same column names used in fit(), which
# avoids scikit-learn's "X does not have valid feature names" warning and makes
# the meaning of each value explicit.
reg_model.predict(pd.DataFrame({'experience': [2], 'test_score': [9], 'interview_score': [6]}))



array([53205.96797671])

In [53]:
# Exercise candidate 2: experience=12, test_score=10, interview_score=10.
# The query is a DataFrame with the same column names used in fit(), which
# avoids scikit-learn's "X does not have valid feature names" warning and makes
# the meaning of each value explicit.
reg_model.predict(pd.DataFrame({'experience': [12], 'test_score': [10], 'interview_score': [10]}))



array([92002.18340611])