In [1]:
# when we have text data (non numeric) we can use hot encoding (binary 0,1 in rows)

# if we use integer than :
# Categorical Variables
# 1. Nominal - (city names) , (male or female) , (colors)
# 2. Ordinal - (customer satisfaction - satisfied,neutral disatisfied) , (high,medium,low)
# in ordinal there is some numeric relation like staisfied is 1 then neutral is 0.5 and disatisfied is 0 

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

In [23]:
df = pd.read_csv("1.csv")

In [24]:
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [25]:
dummies = pd.get_dummies(df.town) # to create dummies of rows
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [26]:
merged = pd.concat([df,dummies], axis="columns") # to merge dataframes
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [27]:
# we have to drop any 1 of the 3 dummy columns because it might create multicollinearity and it will create a trap for multi variables and will mess our ML model

In [28]:
final = merged.drop(["town","west windsor"],axis="columns")

In [29]:
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [30]:
# sklearn's linear regression can avoid dummy trap but its to good avoid is manually

In [37]:
model = LinearRegression()

In [38]:
X = final.drop('price',axis="columns")
X

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [39]:
Y = final.price

In [40]:
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [41]:
model.fit(X,Y)

LinearRegression()

In [44]:
model.predict([[2800,0,1]]) # for town - robinsville

array([590775.63964739])

In [45]:
model.predict([[3400,0,0]])  # for town - west windsor

array([681241.66845839])

In [47]:
model.score(X,Y) # for checking accuracy here it is 95 %

0.9573929037221873

In [48]:
# to do hot encoding 
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [50]:
dflb = df
dflb.town = lb.fit_transform(dflb.town)

In [51]:
dflb

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [53]:
X = dflb[["town","area"]].values
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [105]:
Y = dflb.price.values
Y

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000], dtype=int64)

In [106]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
oe = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder = 'passthrough')

In [107]:
X = oe.fit_transform(X)
X

array([[1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       ...,
       [0., 1., 0., ..., 1., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.]])

In [108]:
X = X[:,1:] # to drop first column

In [109]:
X

array([[0., 1., 0., ..., 1., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       ...,
       [1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [110]:
model.fit(X,Y)

LinearRegression()

In [111]:
model.predict([[0,1,3400]])

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 98 is different from 3)