In [100]:
import pandas as pd

In [101]:
df=pd.read_csv('homeprices.csv')

In [102]:
df.head()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


In [103]:
dummies = pd.get_dummies(df.town)

### Pandas has get_dummies method that takes pandas column and returns categorized columns for each label in the column.

In [104]:
dummies.head()

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


### Merge dummies dataframe to original dataframe

In [105]:
merged = pd.concat([df, dummies], axis='columns')

In [106]:
merged.head()

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0


### Since we have dummies columns here, we can drop town column

In [107]:
merged.drop('town', axis=1, inplace=True)

In [108]:
merged.head()

Unnamed: 0,area,price,monroe township,robinsville,west windsor
0,2600,550000,1,0,0
1,3000,565000,1,0,0
2,3200,610000,1,0,0
3,3600,680000,1,0,0
4,4000,725000,1,0,0


### We can drop one of the dummy variable columns to avoid dummy variable trap

In [109]:
final = merged.drop('west windsor', axis=1)

In [110]:
final.head()

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0


In [111]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()

In [112]:
X=final.drop('price', axis=1)
y=final.loc[:,['price']]

In [113]:
X.head()

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0


In [114]:
y.head()

Unnamed: 0,price
0,550000
1,565000
2,610000
3,680000
4,725000


In [115]:
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [116]:
# Predict price of a house in robinson
model.predict([[2800, 0, 1]])

array([[590775.63964739]])

In [117]:
# Predict price of a house in west windsor
model.predict([[3400, 0, 0]])

array([[681241.66845839]])

In [118]:
model.score(X, y)

0.9573929037221873

### Model seems to be 95.7 p.c. correct

### We use one hot encoder from sklearn to categorize labels

In [119]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df.head()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


In [120]:
dfle = df
dfle.town = le.fit_transform(dfle.town)

In [121]:
dfle.head()

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000


### We import label encoder from sklearn.preprocessing and pass to it the dataframe column that we want to transform using fit_transform function, it returns the encoded values of the label

In [122]:
X = dfle.drop('price', axis=1)
y = dfle.loc[:, ['price']]

In [123]:
model_with_le = LinearRegression()

In [124]:
model_with_le.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [125]:
# Predict price of a house in robinson
model.predict([[2800, 0, 1]])

array([[590775.63964739]])

In [126]:
# Predict price of a house in west windsor
model.predict([[3400, 0, 0]])

array([[681241.66845839]])

In [127]:
model_with_le.score(X, y)

0.9552018104317442

### We got 95.5 p.c. accuracy

In [128]:
from sklearn.preprocessing import OneHotEncoder

In [129]:
ohe = OneHotEncoder(categorical_features=[0])

In [130]:
X = ohe.fit_transform(X).toarray()
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

### We need to use One hot encoder to transform label encoded data into categorical data, we have to import one hot encoder from sklearn.preprocessing and mention index of categorical features using categorical_features while creating one hot encoder object. Using fit_transform, we get array of transformed values

### To avoid dummy variable trap we can remove one of the first three columns

In [131]:
X = X[:, 1:]

In [132]:
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [133]:
y

Unnamed: 0,price
0,550000
1,565000
2,610000
3,680000
4,725000
5,585000
6,615000
7,650000
8,710000
9,575000


In [134]:
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [135]:
# Predict price of a house in robinson
model.predict([[1, 0, 2800]])

array([[590775.63964739]])

In [136]:
# Predict price of a house in west windsor
model.predict([[0, 1, 3400]])

array([[681241.6684584]])

In [137]:
model.score(X, y)

0.9573929037221875