In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
df_train_raw = pd.read_csv('./datasets/train.csv')

df = df_train_raw.copy()

In [3]:
df_test_raw = pd.read_csv('./datasets/test.csv')
df_test = df_test_raw.copy()

In [10]:
df_dummies = pd.get_dummies(df)

a = pd.Series(df_dummies.corr()['SalePrice']).sort_values(ascending=False) > 0.5

df_dummies = df_dummies.loc[:, a].dropna()

In [11]:
X = df_dummies.drop('SalePrice', axis=1)
y = df_dummies['SalePrice']

In [12]:
ss = StandardScaler()

In [13]:
X_sc = ss.fit_transform(X)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_sc, y)

In [15]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [16]:
knn.score(X_test, y_test)

0.7854714741380564

In [17]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [18]:
# testing Linear, Logistic, and K Neighbors, 
# I found that KNeighbors gives the greatest accuracy

In [19]:
params={
    'n_neighbors': list(range(3, 31, 2)),
    'leaf_size': list(range(10,100))
}

In [20]:
gs = GridSearchCV(knn, params, n_jobs=-1)

In [21]:
gs.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'leaf_size': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                       20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
                                       30, 31, 32, 33, 34, 35, 36, 37, 38, 39, ...],
                         'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23,
                                         25, 27, 29]})

In [22]:
gs.best_params_

{'leaf_size': 18, 'n_neighbors': 7}

In [23]:
gs.score(X_test, y_test)

0.7809868710027362

In [None]:
gs.predict()

In [24]:
columns = df_dummies.columns

In [25]:
columns = columns.drop('SalePrice')

In [26]:
df_test = pd.get_dummies(df_test)

In [27]:
df_test = df_test.dropna()

In [28]:
df_test

Unnamed: 0_level_0,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,902301120,190,69.0,9142,6,8,1910,1950,0.0,0,...,0,0,0,0,0,0,0,0,0,1
2414,528218130,60,58.0,17104,7,5,2006,2006,0.0,554,...,0,0,0,0,0,0,1,0,0,0
1989,902207150,30,60.0,8520,5,6,1923,2006,0.0,0,...,0,0,0,0,0,0,0,0,0,1
333,923228370,160,21.0,1890,4,6,1972,1972,0.0,294,...,0,0,0,0,0,0,0,0,0,1
1327,902427150,20,52.0,8516,4,6,1958,2006,0.0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662,527377110,60,80.0,8000,6,6,1974,1974,0.0,931,...,0,0,0,0,0,0,0,0,0,1
1234,535126140,60,90.0,14670,6,7,1966,1999,410.0,575,...,0,0,0,0,0,0,0,0,0,1
1373,904100040,20,55.0,8250,5,5,1968,1968,0.0,250,...,0,0,0,0,0,0,0,0,0,1
1672,527425140,20,60.0,9000,4,6,1971,1971,0.0,616,...,0,0,0,0,0,0,0,0,0,1


In [29]:
X_test_set = df_test[columns]

In [31]:
ss = StandardScaler()

In [32]:
X_test_set_sc = ss.fit_transform(X_test_set)

In [33]:
X_test_set_sc

array([[-0.10454689, -1.94724858, -1.68189035, ..., -0.90547914,
        -0.33687808, -0.27933041],
       [ 0.61038488,  1.12590602,  1.0049387 , ...,  1.10438767,
        -0.33687808, -0.27933041],
       [-0.81947867, -1.53109223,  1.0049387 , ..., -0.90547914,
        -0.33687808, -0.27933041],
       ...,
       [-0.81947867, -0.09055101, -0.81826673, ..., -0.90547914,
        -0.33687808, -0.27933041],
       [-1.53441044,  0.00548507, -0.67432946, ...,  1.10438767,
        -0.33687808, -0.27933041],
       [-0.81947867, -0.50670736, -1.4419949 , ..., -0.90547914,
        -0.33687808, -0.27933041]])

In [34]:
predictions = pd.DataFrame(gs.predict(X_test_set_sc))

In [43]:
index = X_test_set.reset_index()

In [44]:
predictions.join(index['Id'])

Unnamed: 0,0,Id
0,143342.857143,2658
1,180489.857143,2414
2,145000.000000,1989
3,88121.428571,333
4,128471.428571,1327
...,...,...
672,172685.714286,1662
673,188857.142857,1234
674,135492.857143,1373
675,133971.428571,1672
