In [211]:
# Generate your regression and classification models using the training data. We expect that within this process, you'll be making use of:
# train-test split
# cross-validation / grid searching for hyperparameters
# strong exploratory data analysis to question correlation and relationship across predictive variables
# code that reproducibly and consistently applies feature transformation (such as the preprocessing library)
# Predict the values for your target columns in the test dataset and submit your predictions to Kaggle to see how your model does against unknown data.

In [212]:
import warnings
warnings.filterwarnings('ignore')

In [213]:
#Import the packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, skew
%matplotlib inline

In [214]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [215]:
train.shape

(2051, 82)

In [216]:
sale_condition=[]
for i in train['Sale Condition']:
    if i=='Normal':
        sale_condition.append(0)
    else:
        sale_condition.append(1)

In [217]:
train['Sale Condition']=sale_condition

In [218]:
total = full.isnull().sum().sort_values(ascending=False)
percent = (full.isnull().sum()/full.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data[missing_data['Percent']>0.2]

Unnamed: 0,Total,Percent
Pool QC,2917,0.995563
Misc Feature,2824,0.963823
Alley,2732,0.932423
Fence,2358,0.804778
Fireplace Qu,1422,0.485324
SalePrice,879,0.3


In [219]:
train_labels = train.pop('Sale Condition')
features = pd.concat([train, test], keys=['train', 'test'])

In [220]:
features.drop(['Pool QC', 'Misc Feature', 'Fireplace Qu', 'Fence', 'Alley','PID','SalePrice'],
              axis=1, inplace=True)
features.shape

(2930, 74)

In [221]:
features.isnull().sum().sort_values(ascending=False).head(23)

Lot Frontage      490
Garage Yr Blt     159
Garage Qual       159
Garage Finish     159
Garage Cond       159
Garage Type       157
Bsmt Exposure      83
BsmtFin Type 2     81
Bsmt Qual          80
BsmtFin Type 1     80
Bsmt Cond          80
Mas Vnr Area       23
Mas Vnr Type       23
Bsmt Full Bath      2
Bsmt Half Bath      2
Total Bsmt SF       1
Electrical          1
Garage Cars         1
Garage Area         1
BsmtFin SF 2        1
Bsmt Unf SF         1
BsmtFin SF 1        1
Functional          0
dtype: int64

In [222]:
features['Lot Frontage'] = features['Lot Frontage'].fillna(features['Lot Frontage'].mean())
for col in ('Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond'):
    features[col] = features[col].fillna('NoGRG')


In [223]:
features['Garage Cars'] = features['Garage Cars'].fillna(0.0)
features['Garage Area'] = features['Garage Area'].fillna(0.0)
features.drop(['Total Bsmt SF',  'Garage Yr Blt'], axis=1, inplace=True)

In [224]:
features['BsmtFin SF 1'] = features['BsmtFin SF 1'].fillna(0)   
features['BsmtFin SF 2'] = features['BsmtFin SF 2'].fillna(0)  

In [225]:
features['Bsmt Full Bath'] = features['Bsmt Full Bath'].fillna(features['Bsmt Full Bath'].median())
features['Bsmt Half Bath'] = features['Bsmt Half Bath'].fillna(features['Bsmt Half Bath'].median())

In [226]:
features['Bsmt Unf SF'] = features['Bsmt Unf SF'].fillna(features['Bsmt Unf SF'].mean())
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])

In [227]:
for col in ('Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Bsmt Cond','Bsmt Qual'):
    features[col] = features[col].fillna('NoBasement')

In [228]:
features['Mas Vnr Area'] = features['Mas Vnr Area'].fillna(0.0)
features['Mas Vnr Type'] = features['Mas Vnr Type'].fillna('None')

In [229]:
features.isnull().sum().sort_values(ascending=False).head(5)

Yr Sold           0
Year Remod/Add    0
Enclosed Porch    0
Exter Cond        0
Exter Qual        0
dtype: int64

In [230]:
features.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2930 entries, (train, 0) to (test, 878)
Data columns (total 72 columns):
1st Flr SF         2930 non-null int64
2nd Flr SF         2930 non-null int64
3Ssn Porch         2930 non-null int64
Bedroom AbvGr      2930 non-null int64
Bldg Type          2930 non-null object
Bsmt Cond          2930 non-null object
Bsmt Exposure      2930 non-null object
Bsmt Full Bath     2930 non-null float64
Bsmt Half Bath     2930 non-null float64
Bsmt Qual          2930 non-null object
Bsmt Unf SF        2930 non-null float64
BsmtFin SF 1       2930 non-null float64
BsmtFin SF 2       2930 non-null float64
BsmtFin Type 1     2930 non-null object
BsmtFin Type 2     2930 non-null object
Central Air        2930 non-null object
Condition 1        2930 non-null object
Condition 2        2930 non-null object
Electrical         2930 non-null object
Enclosed Porch     2930 non-null int64
Exter Cond         2930 non-null object
Exter Qual         2930 non-null obje

In [231]:
for col in features.dtypes[features.dtypes == 'object'].index:
    if col!='Sale Condition':
        for_dummy = features.pop(col)
        features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col)], axis=1)

In [232]:
features.shape

(2930, 286)

In [233]:
train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values

In [270]:
x_train, x_test, ydd_train, y_test = train_test_split(train_features,train_labels,test_size=0.33,random_state=200)

In [236]:
from sklearn.metrics import r2_score, mean_squared_error
def get_score(prediction, lables):    
    print('R2: {}'.format(r2_score(prediction, lables)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, lables))))

# Shows scores for train and validation sets    
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # Printing train scores
    get_score(prediction_train, y_trn)
    prediction_test = estimator.predict(x_tst)
    # Printing test scores
    print("Test")
    get_score(prediction_test, y_tst)

In [243]:
from sklearn.neighbors import KNeighborsClassifier
# make an instance of a KNeighborsClassifier object with 1 neighbor
knn = KNeighborsClassifier(n_neighbors=x_train.shape[0])

# fit on the unstandardized data:
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1374, p=2,
           weights='uniform')

In [244]:
x_train.shape[0]

1374

In [245]:
y_pred = knn.predict(x_test)

# compute classification accuracy
from sklearn import metrics

print(metrics.accuracy_score(y_test, y_pred))

0.834564254062


In [249]:
predictions_knn=knn.predict(test_features)

In [255]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [250]:
test_id = test.Id

In [253]:
test_submit = pd.DataFrame({'Id': test_id, 'Sale Condition': predictions_knn})
test_submit.shape
test_submit.head()

Unnamed: 0,Id,Sale Condition
0,2658,0
1,2718,0
2,2414,0
3,1989,0
4,625,0


In [254]:
test_submit.to_csv('knn.csv', index=False)

In [257]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.metrics import confusion_matrix

In [277]:
ss = StandardScaler()
ss.fit(x_train)
x_train_std = ss.transform(x_train)

In [278]:
ss.fit(x_test)
x_test_std = ss.transform(x_test)

In [279]:
lr_01 = LogisticRegression(penalty='l1', C=0.1)
lr_01.fit(x_train_std, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [280]:
lr_1 = LogisticRegression(penalty='l1', C=1.0)
lr_1.fit(x_train_std, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [281]:
lr_10 = LogisticRegression(penalty='l1', C=10.0)
lr_10.fit(x_train_std, y_train)

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [282]:
print(lr_01.score(x_train_std, y_train))
print(lr_01.score(x_test_std, y_test))

0.919213973799
0.914327917282


In [293]:
predictions_log=lr_01.predict(test_features)

In [294]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,