In [1]:
# python version: python3

In [2]:
import pandas as pd
import numpy as np

## DATA EXPLORATION & PREPARATION

### part a

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
train_x_a = train.drop(u'SalePrice',axis=1)
train_y = train[['SalePrice']]

In [5]:
train_x_a.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold
0,60,65.0,8450,Pave,7,5,196.0,706,150,856,...,0,2,1,3,1,8,0,2,548,2
1,20,80.0,9600,Pave,6,8,0.0,978,284,1262,...,1,2,0,3,1,6,1,2,460,5


In [6]:
train_y.head(2)

Unnamed: 0,SalePrice
0,208500
1,181500


### part b

In [7]:
train_y.isnull().sum()

SalePrice    0
dtype: int64

In [8]:
nulls = train_x_a.isnull().sum().to_dict()

In [9]:
nan_columns = [col for col,null in nulls.items() if null>0] 
train_x_b = train_x_a.fillna(train_x_a.median())

In [10]:
print(nan_columns)

['LotFrontage', 'MasVnrArea']


In [11]:
train_x_b.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold
0,60,65.0,8450,Pave,7,5,196.0,706,150,856,...,0,2,1,3,1,8,0,2,548,2
1,20,80.0,9600,Pave,6,8,0.0,978,284,1262,...,1,2,0,3,1,6,1,2,460,5


### part c

In [12]:
datatypes = train_x_a.dtypes.to_dict()

In [13]:
categorical_columns = [col for col,typ in datatypes.items() if typ=="O"]

In [14]:
print(categorical_columns)

['Street']


### part d

In [15]:
one_hot = pd.get_dummies(train_x_b['Street'])

In [16]:
train_x_d = pd.concat([train_x_b.drop(["Street"],axis=1),one_hot],axis=1)

In [17]:
train_x_d.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Grvl,Pave
0,60,65.0,8450,7,5,196.0,706,150,856,856,...,1,3,1,8,0,2,548,2,0,1
1,20,80.0,9600,6,8,0.0,978,284,1262,1262,...,0,3,1,6,1,2,460,5,0,1


### part e

In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(train_x_d.values)
scaled_features = scaler.transform(train_x_d.values)

In [19]:
train_x_e =  pd.DataFrame(scaled_features, index=train_x_d.index, columns=train_x_d.columns)

In [20]:
train_x_e.head(7)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Grvl,Pave
0,0.073872,-0.235351,-0.196474,0.6329,-0.529618,0.464035,0.589782,-0.939672,-0.486827,-0.802481,...,1.230454,0.178216,-0.208547,0.935889,-0.939129,0.316364,0.357524,-1.601265,-0.063372,0.063372
1,-0.873204,0.475965,-0.095659,-0.090414,2.177118,-0.576236,1.204925,-0.638291,0.475863,0.280104,...,-0.773542,0.178216,-0.208547,-0.307817,0.600426,0.316364,-0.063938,-0.485919,-0.063372,0.063372
2,0.073872,-0.093088,0.04899,0.6329,-0.529618,0.28358,0.09224,-0.300924,-0.335073,-0.631827,...,1.230454,0.178216,-0.208547,-0.307817,0.600426,0.316364,0.644884,1.001209,-0.063372,0.063372
3,0.310641,-0.472456,-0.100042,0.6329,-0.529618,-0.576236,-0.51838,-0.062519,-0.723943,-0.522502,...,-0.773542,0.178216,-0.208547,0.314036,0.600426,1.662593,0.807722,-1.601265,-0.063372,0.063372
4,0.073872,0.66565,0.312864,1.356214,-0.529618,1.281391,0.474443,-0.174974,0.198437,-0.031872,...,1.230454,1.407294,-0.208547,1.557742,0.600426,1.662593,1.736853,2.116554,-0.063372,0.063372
5,-0.162897,0.713071,0.300152,-0.813729,-0.529618,-0.576236,0.648582,-1.133096,-0.629097,-0.962469,...,1.230454,-2.279938,-0.208547,-0.92967,-0.939129,0.316364,0.031849,1.37299,-0.063372,0.063372
6,-0.873204,0.23886,-0.053229,1.356214,-0.529618,0.41096,2.089193,-0.564071,1.481234,1.432018,...,-0.773542,0.178216,-0.208547,0.314036,0.600426,0.316364,0.778986,0.629427,-0.063372,0.063372


## LINEAR REGRESSION TO PREDICT HOUSE PRICES

### part f

In [21]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression().fit(train_x_e,train_y)

  linalg.lstsq(X, y)


In [22]:
from sklearn.metrics import mean_squared_error
predicts = lr_model.predict(train_x_e)
train_mse = mean_squared_error(train_y, predicts)
print(train_mse)

1041634341.4633691


### part g

In [23]:
from sklearn import cross_validation as CV

lr_model2 = LinearRegression()
cross_val_mse_scores = CV.cross_val_score(lr_model2, train_x_e,train_y, cv=5, scoring='mean_squared_error')

print(cross_val_mse_scores)

[-9.69013184e+08 -7.68184442e+08 -1.66781305e+09 -1.46841147e+09
 -8.84970495e+08]


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [24]:
average_cross_val_mse = sum(cross_val_mse_scores)/len(cross_val_mse_scores)
print(average_cross_val_mse)

-1151678527.0528073


### part h

In [25]:
test_x = test.drop(u'SalePrice',axis=1).fillna(train_x_a.median()) # Fill nans with MEDIAN OF TRAINING DATA VALUES
test_y = test[['SalePrice']]

In [26]:
one_hot = pd.get_dummies(test_x['Street'])

In [27]:
test_x = pd.concat([test_x.drop(["Street"],axis=1),one_hot],axis=1)

In [28]:
scaled_features = scaler.transform(test_x) # Use the same scaler that you used for training. DONT FIT TEST DATA
test_x =  pd.DataFrame(scaled_features, index=test_x.index, columns=test_x.columns)

In [29]:
test_x.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Grvl,Pave
0,-0.873204,0.191439,-0.042533,-2.260357,-2.334109,-0.576236,-1.006876,-1.277039,-2.516538,-0.567832,...,-0.773542,-1.050861,-0.208547,-1.551524,-0.939129,0.316364,0.261737,0.257645,-0.063372,0.063372
1,-0.636435,-0.472456,-0.463855,-0.813729,0.372627,-0.576236,-1.006876,0.277097,-0.878068,-1.242448,...,-0.773542,-1.050861,-0.208547,-1.551524,-0.939129,-1.029865,-1.232536,-1.973046,-0.063372,0.063372


In [30]:
test_y.head(2)

Unnamed: 0,SalePrice
0,82000
1,86000


### part i

In [31]:
predicted_values = lr_model.predict(test_x)

In [32]:
print(predicted_values[10:13])

[[119236.49420048]
 [ 83816.49420048]
 [146940.49420048]]


In [33]:
test_mse_score = mean_squared_error(test_y, predicted_values)
print(test_mse_score)

1936165973.8926947


## CLASSIFICATION MODEL TO PREDICT HOUSE PRICE CATEGORY

### part j

house price < 100.000 <br>
100.000 <= house price < 200.000 <br>
200.000 <= house price < 300.000 <br>
300.000 <= house price < 400.000 <br>
400.000 <= house price <br>

In [34]:
def categorize(x):
    if x<100000:
        return 1
    elif 100000<=x and x<200000:
        return 2
    elif 200000<=x and x<300000:
        return 3
    elif 300000<=x and x<400000:
        return 4
    else:
        return 5

In [35]:
train_y_j = train_y.applymap(lambda x: categorize(x))
test_y_j = test_y.applymap(lambda x: categorize(x))

In [37]:
train_y_j.head(2)

Unnamed: 0,SalePrice
0,3
1,2


In [38]:
test_y_j.head(2)

Unnamed: 0,SalePrice
0,1
1,1


### part k

In [39]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression

In [40]:
lr = LogisticRegression()

In [41]:
model = OneVsOneClassifier(lr)

### part l

In [43]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
X = train_x_e.values
y = np.array([x[0] for x in train_y_j.values.tolist()])

fold = 1
for train_index, test_index in kf.split(X):
    print("\nFold: {}".format(fold))
    lr = LogisticRegression()
    model = OneVsOneClassifier(lr)
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train)
    preds = model.predict(X_train)
    
    accuracy = accuracy_score(y_train,preds)
    precision = precision_score(y_train,preds, average="micro")
    recall = recall_score(y_train,preds, average="micro")
    f1 = f1_score(y_train,preds, average="micro")
    confusion_matrix_ = confusion_matrix(y_train,preds)
    print(confusion_matrix_)
    print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 
    
    fold+=1


Fold: 1
[[ 42  22   0   0   0]
 [  5 466  20   0   1]
 [  0  31 139   5   0]
 [  0   0  13  36   3]
 [  0   0   0   9   8]]
accuracy: 0.86375
precision: 0.86375
recall: 0.86375
f1: 0.86375

Fold: 2
[[ 31  25   0   0   0]
 [  7 474  21   0   1]
 [  0  31 134   5   1]
 [  0   0  14  39   0]
 [  0   0   0   5  12]]
accuracy: 0.8625
precision: 0.8625
recall: 0.8625
f1: 0.8625

Fold: 3
[[ 42  20   0   0   0]
 [  8 461  21   1   0]
 [  0  26 143   8   0]
 [  0   0  19  36   0]
 [  0   0   0   4  11]]
accuracy: 0.86625
precision: 0.86625
recall: 0.86625
f1: 0.86625

Fold: 4
[[ 37  23   0   0   0]
 [  8 464  22   0   1]
 [  0  29 142   7   1]
 [  0   0  16  32   2]
 [  0   0   0   7   9]]
accuracy: 0.855
precision: 0.855
recall: 0.855
f1: 0.855

Fold: 5
[[ 41  25   0   0   0]
 [  8 444  22   0   1]
 [  0  26 147   8   1]
 [  0   0  16  40   2]
 [  0   0   0   7  12]]
accuracy: 0.855
precision: 0.855
recall: 0.855
f1: 0.855


In [44]:
# Alternative Method
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(model, train_x_e, train_y_j.values.ravel(), cv=5)
accuracy = accuracy_score(train_y_j.values.ravel(), y_train_pred)
precision = precision_score(train_y_j.values.ravel(), y_train_pred, average = 'micro')
recall = recall_score(train_y_j.values.ravel(), y_train_pred, average = 'micro')
f1 = f1_score(train_y_j.values.ravel(), y_train_pred, average = 'micro')
conf_max = confusion_matrix(train_y_j.values.ravel(), y_train_pred)

In [45]:
print(conf_max)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 

[[ 41  36   0   0   0]
 [ 13 567  32   1   1]
 [  0  40 164  15   2]
 [  0   0  22  41   4]
 [  0   0   2  13   6]]
accuracy: 0.819
precision: 0.819
recall: 0.819
f1: 0.819


### part m

In [47]:
model.fit(train_x_e, train_y_j)
predicted_values = model.predict(test_x)

  y = column_or_1d(y, warn=True)


In [48]:
print(predicted_values[20:23])

[2 3 2]


In [55]:
accuracy = accuracy_score(test_y_j,predicted_values)
precision = precision_score(test_y_j,predicted_values, average="micro")
recall = recall_score(test_y_j,predicted_values, average="micro")
f1 = f1_score(test_y_j,predicted_values, average="micro")
confusion_matrix_ = confusion_matrix(test_y_j.values.ravel(),predicted_values)

In [56]:
print(confusion_matrix_)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 

[[ 24  13   0   0   0]
 [  4 273  19   0   1]
 [  0  18  76   5   0]
 [  0   0   3  16   1]
 [  0   0   0   3   4]]
accuracy: 0.8543478260869565
precision: 0.8543478260869565
recall: 0.8543478260869565
f1: 0.8543478260869565
