In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

In [2]:
# Load the housing data set; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('homeprices.csv')

In [3]:
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [4]:
# data preprocessing - handling missing values

In [5]:
df.bedrooms.median()

4.0

In [6]:
# Preview only: the result is not assigned, so df itself is unchanged.
# A {column: value} mapping restricts the fill to the bedrooms column; a bare
# scalar would be written into every NaN of every column in the frame.
df.fillna({'bedrooms': df.bedrooms.median()})

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [7]:
# Series-level variant: fills only the bedrooms column and returns a new Series (df is not modified).
df.bedrooms.fillna(df.bedrooms.median())

0    3.0
1    4.0
2    4.0
3    3.0
4    5.0
5    6.0
Name: bedrooms, dtype: float64

In [8]:
# Persist the imputation: missing bedroom counts are replaced by the column median.
bedrooms_median = df['bedrooms'].median()
df['bedrooms'] = df['bedrooms'].fillna(bedrooms_median)

In [9]:
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [10]:
# Name the predictors explicitly instead of "everything except price".
# df gains a predicted_price column further down, so re-running this cell with
# df.drop('price', axis=1) would silently train on the model's own output.
reg = linear_model.LinearRegression()
reg.fit(df[['area', 'bedrooms', 'age']], df['price'])

In [11]:
# Learned weights, one per predictor in the order the model was fitted on: area, bedrooms, age.
reg.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [12]:
reg.intercept_

221323.00186540408

In [13]:
# Score the training rows on the same three predictors the model was fitted on.
# Selecting them by name keeps this cell re-runnable once predicted_price has
# been added to df; df.drop('price', axis=1) would then hand the model an
# extra column it was never trained on.
predicted = reg.predict(df[['area', 'bedrooms', 'age']])

In [14]:
# Store the fitted values next to the actual prices for a side-by-side comparison.
# NOTE: this adds a column to df in place, so df now holds a non-feature column besides price.
df['predicted_price'] = predicted

In [15]:
df

Unnamed: 0,area,bedrooms,age,price,predicted_price
0,2600,3.0,20,550000,518217.632976
1,3000,4.0,15,565000,602590.079374
2,3200,4.0,18,610000,615307.414037
3,3600,3.0,30,595000,597962.895832
4,4000,5.0,8,760000,760663.426755
5,4100,6.0,8,810000,795258.551027


In [16]:
''' 
predict the home prices that has:

a) 3000 sq-ft area, 3 bed rooms , 40 year old
b) 2500 sq-ft area, 4 bed rooms, 5 year old
'''

' \npredict the home prices that has:\n\na) 3000 sq-ft area, 3 bed rooms , 40 year old\nb) 2500 sq-ft area, 4 bed rooms, 5 year old\n'

In [17]:
# Homes to price, one row each: (area in sq-ft, bedrooms, age in years).
# The rows are labelled with the training column names because scikit-learn
# warns about missing feature names when a model fitted on a DataFrame is
# given a bare ndarray; labelled columns also make the feature order explicit.
x_test = pd.DataFrame(
    [[3000, 3, 40], [2500, 4, 5]],
    columns=['area', 'bedrooms', 'age'],
)

In [18]:
reg.predict(x_test)



array([498408.25158031, 578876.03748933])

In [19]:
# Verify the predictions by evaluating the regression equation by hand

In [20]:
reg.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [21]:
reg.intercept_

221323.00186540408

In [22]:
# Manual check of the two predictions: price = dot(features, coefficients) + intercept.
a = np.dot(np.array([3000,3,40]),reg.coef_) + reg.intercept_  # home a: 3000 sq-ft, 3 bedrooms, 40 years old
b = np.dot(np.array([2500,4,5]),reg.coef_) + reg.intercept_  # home b: 2500 sq-ft, 4 bedrooms, 5 years old

In [23]:
print(a,b)

498408.25158030697 578876.0374893326


# Exercise

In [24]:
# Load the exercise data set; the relative path is resolved against the notebook's working directory.
data = pd.read_csv('hiring.csv')

In [25]:
data

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [26]:
pip install word2number

Note: you may need to restart the kernel to use updated packages.


In [27]:
# word_to_num lives in the w2n submodule. `import word2number as w2n` binds the
# top-level package instead, which does not expose word_to_num, so every
# w2n.word_to_num(...) call failed.
from word2number import w2n
# Handy library for converting number words to numbers,
# e.g. w2n.word_to_num('eleven') -> 11.

In [28]:
def convert_word_to_number(word):
    """Translate a number word into its numeric value via word2number.

    Returns None instead of raising when w2n.word_to_num rejects the input
    with a ValueError.
    """
    try:
        number = w2n.word_to_num(word)
    except ValueError:
        # The library could not interpret the input as a number.
        number = None
    return number

In [29]:
# data preprocessing - handling missing values

In [30]:
# Impute the missing test score with the mean of the observed scores.
score_col = 'test_score(out of 10)'
score_mean = data[score_col].mean()
data[score_col] = data[score_col].fillna(score_mean)

In [31]:
data

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,7.857143,7,72000
7,eleven,7.0,8,80000


In [32]:
# A missing experience entry is stored as the word 'zero' so it can be converted like the other rows.
data['experience'] = data['experience'].fillna('zero')

In [33]:
data.experience

0      zero
1      zero
2      five
3       two
4     seven
5     three
6       ten
7    eleven
Name: experience, dtype: object

In [34]:
# Number words understood by word_to_number, mapped to their integer value.
_WORD_VALUES = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
}


def word_to_number(word):
    """Convert an English number word ('zero' through 'eleven') to an int.

    Matching ignores letter case and surrounding whitespace, so 'Five' and
    ' five ' both give 5.

    Args:
        word: The number word to convert.

    Returns:
        The integer for a recognised word, or None when ``word`` is not a
        string (e.g. NaN) or is not one of the supported words.
    """
    # Non-string cells (NaN, None, numbers) cannot be number words.
    if not isinstance(word, str):
        return None
    return _WORD_VALUES.get(word.strip().lower())

In [35]:
# Convert the experience words to integers, element by element.
# NOTE: run once. After conversion the column holds ints, which word_to_number maps to None.
data['experience'] = data['experience'].apply(word_to_number)

In [36]:
data

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.857143,7,72000
7,11,7.0,8,80000


In [37]:
# Separate estimator for the hiring exercise, so the housing model held in `reg` is left untouched.
reg1 = linear_model.LinearRegression()

In [38]:
# Train on every column except the target; salary($) is the value to predict.
reg1.fit(data.drop(columns='salary($)'), data['salary($)'])

In [39]:
# Candidates to score, one row each: (experience, test score, interview score).
# The rows carry the training column names so scikit-learn can match features
# by name rather than relying on the positional order of an unlabeled array.
candidates = pd.DataFrame(
    [[2, 9, 6], [12, 10, 10]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
)
reg1.predict(candidates)



array([53290.89255945, 92268.07227784])