In [107]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
import seaborn as sns
%matplotlib inline

In [108]:
data = pd.read_csv("AmesHousing.txt", sep = '\t')
train = data[:1460]
test = data[1460:]

In [109]:
def transform_features(df):
    if "PID" in df.columns:
        df.drop("PID", axis = 1, inplace = True)
    train_set = df
    train_set["years_until_remod"] = train_set["Year Remod/Add"] - train_set["Year Built"]    

    cutoff = 0.25 * len(train_set)
    train_null_counts = train_set.isnull().sum()
    df_missing_values = train_set[train_null_counts[(train_null_counts < cutoff) & (train_null_counts > 0)].index]
    float_cols = df_missing_values.select_dtypes(include=['float'])
    float_cols = float_cols.fillna(float_cols.mean())
    train_set = train_set.drop(float_cols.columns.values, axis=1)
    train_set = pd.concat([train_set, float_cols], axis = 1)
    text_cols = train_set.select_dtypes(include=['object'])
    
    #dropping columns with large number of unique values
    for cols in text_cols.columns:
        if len(text_cols[cols].value_counts()) < 5:
            text_cols.drop(cols, axis = 1)
    
    #dropping columns with low standard deviation
    text_cols = text_cols.drop(text_cols.var()[text_cols.var() < 0.2].index.values, axis=1)

    for col in text_cols:
        train_set[col] = train_set[col].astype('category')
    for col in text_cols:
        col_dummies = pd.get_dummies(train_set[col])
        train_set = pd.concat([train_set, col_dummies], axis=1)
        del train_set[col]
    
    train_null_counts = train_set.isnull().sum()
    df_missing_values = train_set[train_null_counts[(train_null_counts > 0)].index]
    train_set = train_set.drop(df_missing_values.columns.values, axis=1)
    return train_set

In [110]:
train_set = transform_features(train)
train_set.info()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 284 entries, Order to Partial
dtypes: float64(11), int64(28), uint8(245)
memory usage: 794.2 KB


In [112]:
test_set = transform_features(test)
test_set.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 1460 to 2929
Columns: 296 entries, Order to Partial
dtypes: float64(11), int64(28), uint8(257)
memory usage: 816.9 KB


In [116]:
def select_features(df):
    #correlations
    corrs = df[df.columns].corr()["SalePrice"]
    sorted_corrs = corrs.sort_values()
    strong_corrs = sorted_corrs[sorted_corrs > 0.3]
    corrsmat = df[strong_corrs.index].corr()
    #sns.heatmap(corrsmat)
    #highest correlations
    features = strong_corrs.index
    return corrsmat, df[features]

In [117]:
def train_and_test(df):
    _, df = select_features(df)
    train_df = df[df.columns.difference(['SalePrice'])]
    model = LinearRegression()
    kf = KFold(len(df))
    mse = cross_val_score(model, train_df, df["SalePrice"], scoring = "mean_squared_error", cv=kf)
    return mse

In [105]:
#corrsmat, df = select_features(train_set)
#sns.heatmap(corrsmat)

In [134]:
mse = train_and_test(train_set)

In [141]:
print(mse.dtype)

mse_list = mse
np.array(mse_list)**0.5
# print(np.mean(mse_list))

848016316.086
2.08897442196e+25
3.74699049315e+25
