In [2]:
import pandas as pd
import scipy.stats as stats
from sklearn.model_selection import train_test_split

In [3]:
# Read the raw home-price table; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="homeprices.csv")
df

Unnamed: 0,town,area,price
0,Mumbai,2600,550000
1,Mumbai,3000,565000
2,Mumbai,3200,610000
3,Mumbai,3600,680000
4,Mumbai,4000,725000
5,Delhi,2600,585000
6,Delhi,2800,615000
7,Delhi,3300,650000
8,Delhi,3600,710000
9,Chennai,2600,575000


### Using pandas to create dummy variables

In [8]:
# One-hot encode the town column: one indicator column per distinct town.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return True/False booleans).
# Passing drop_first=True here would instead drop the first category up front.
dummies = pd.get_dummies(df.town, dtype=int)
dummies

Unnamed: 0,Chennai,Delhi,Mumbai
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
5,0,1,0
6,0,1,0
7,0,1,0
8,0,1,0
9,1,0,0


In [9]:
# Place the indicator columns to the right of the original columns (rows are aligned on the index).
df_dummies = pd.concat(objs=[df, dummies], axis=1)
df_dummies

Unnamed: 0,town,area,price,Chennai,Delhi,Mumbai
0,Mumbai,2600,550000,0,0,1
1,Mumbai,3000,565000,0,0,1
2,Mumbai,3200,610000,0,0,1
3,Mumbai,3600,680000,0,0,1
4,Mumbai,4000,725000,0,0,1
5,Delhi,2600,585000,0,1,0
6,Delhi,2800,615000,0,1,0
7,Delhi,3300,650000,0,1,0
8,Delhi,3600,710000,0,1,0
9,Chennai,2600,575000,1,0,0


In [10]:
# The text column 'town' is now represented by the indicator columns, so remove it.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df_dummies = df_dummies.drop(columns="town", errors="ignore")
df_dummies

Unnamed: 0,area,price,Chennai,Delhi,Mumbai
0,2600,550000,0,0,1
1,3000,565000,0,0,1
2,3200,610000,0,0,1
3,3600,680000,0,0,1
4,4000,725000,0,0,1
5,2600,585000,0,1,0
6,2800,615000,0,1,0
7,3300,650000,0,1,0
8,3600,710000,0,1,0
9,2600,575000,1,0,0


### Dummy Variable Trap
When one variable can be derived from the other variables, they are said to be multicollinear. Here, if you know the values of Chennai and Mumbai, you can easily infer the value of Delhi (e.g. Chennai=0 and Mumbai=0 means Delhi=1). Therefore these variables are multicollinear. In this situation linear regression won't work as expected, so you need to drop one column.

NOTE: The sklearn library takes care of the dummy variable trap, so the model will still work even if you don't drop one of these columns. However, we should make a habit of handling the dummy variable trap ourselves, in case the library we are using does not handle it for us.

In [11]:
# Avoid the dummy variable trap: drop one indicator (Mumbai) so it becomes the
# baseline category, encoded as Chennai=0 and Delhi=0.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df_dummies = df_dummies.drop(columns="Mumbai", errors="ignore")
df_dummies

Unnamed: 0,area,price,Chennai,Delhi
0,2600,550000,0,0
1,3000,565000,0,0
2,3200,610000,0,0
3,3600,680000,0,0
4,4000,725000,0,0
5,2600,585000,0,1
6,2800,615000,0,1
7,3300,650000,0,1
8,3600,710000,0,1
9,2600,575000,1,0


In [12]:
# Separate the target (price) from the predictors, then hold out 20% of the rows for testing.
y = df_dummies["price"]
X = df_dummies.drop(columns="price")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,  # fixed seed so the split is the same on every run
)

In [13]:
# (rows, columns) of the training feature matrix.
X_train.shape

(10, 3)

In [14]:
# (rows, columns) of the held-out test feature matrix.
X_test.shape

(3, 3)

In [36]:
# Number of held-out target values; should match the row count of X_test.
y_test.shape

(3,)

In [15]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of price on the training features (area plus the town indicators).
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [16]:
# Inspect the rows that ended up in the training split.
X_train

Unnamed: 0,area,Chennai,Delhi
10,2900,1,0
1,3000,0,0
6,2800,0,1
0,2600,0,0
7,3300,0,1
12,3600,1,0
9,2600,1,0
8,3600,0,1
11,3100,1,0
5,2600,0,1


In [45]:
# Predicted prices for the held-out test rows.
model.predict(X_test)

array([602590.90909091, 647681.81818182, 692772.72727274])

### Watch Out

In [23]:
# Wrap the raw prediction array in a one-column frame so it can later sit beside the actual prices.
predict = pd.DataFrame({"Prediction": model.predict(X_test)})
predict

Unnamed: 0,Prediction
0,602590.909091
1,647681.818182
2,692772.727273


### END

In [46]:
# Actual prices of the held-out test rows, for comparison with the predictions.
y_test

2    610000
3    680000
4    725000
Name: price, dtype: int64

### WATCH OUT

In [40]:
# Renumber the test targets from 0 and turn them into a one-column frame,
# so their index lines up with a freshly built prediction frame.
actual = y_test.reset_index(drop=True).to_frame()
actual

Unnamed: 0,price
0,610000
1,680000
2,725000


In [42]:
# Put predictions and actual prices side by side; rows are matched on the index.
predict_vs_actual = pd.concat(objs=[predict, actual], axis="columns")
predict_vs_actual

Unnamed: 0,Prediction,price
0,602590.909091,610000
1,647681.818182,680000
2,692772.727273,725000


### END

In [44]:
# R^2 of the fitted model, measured on the same rows it was trained on.
model.score(X=X_train, y=y_train)

0.9593125005441832

In [15]:
# Predict the price of a 5000 sqft house in Mumbai. Mumbai is the dropped
# baseline category, so both the Chennai and Delhi indicators are 0.
# Building the input as a DataFrame with the training columns keeps the
# feature names and order explicit instead of relying on list position.
model.predict(pd.DataFrame([[5000, 0, 0]], columns=X_train.columns))

array([844263.59922597])

In [13]:
# Predict the price of a 2800 sqft house in Delhi (Chennai=0, Delhi=1).
# Building the input as a DataFrame with the training columns keeps the
# feature names and order explicit instead of relying on list position.
model.predict(pd.DataFrame([[2800, 0, 1]], columns=X_train.columns))

array([605103.20361213])