In [None]:
# Solution: https://github.com/dataquestio/solutions/blob/master/Mission240Solutions.ipynb

In [418]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
%matplotlib inline

In [419]:
# Name of the column the regression predicts; the feature-selection and
# normalisation helpers below exclude it from the feature set.
TARGET_COLUMN = 'SalePrice'

### Get and transform data

In [489]:
def get_data(fname):
    """Read the tab-separated file ``fname`` and return it as a DataFrame.

    ``fname`` is handed straight to ``pd.read_csv``, so anything that
    function accepts (path or file-like object) works here.
    """
    return pd.read_csv(fname, sep='\t')


def handle_missing_values(df, mv_cutoff=0.05):
    """Drop sparse columns and impute the remaining gaps with the mode.

    Parameters:
        df: input DataFrame (left untouched; a new frame is returned).
        mv_cutoff: columns whose share of missing values is strictly
            greater than this fraction are removed.

    Returns:
        A new DataFrame in which every surviving column that had missing
        values has them replaced by that column's most frequent value.
    """
    # Fraction of missing entries per column.
    missing_share = df.isnull().sum() / df.shape[0]
    too_sparse = [name for name, share in missing_share.items()
                  if share > mv_cutoff]
    cleaned = df.drop(columns=too_sparse)
    for name in cleaned.columns:
        if missing_share[name] > 0:
            # mode() may return several values; take the first one.
            most_common = cleaned[name].mode()[0]
            cleaned[name] = cleaned[name].fillna(value=most_common)
    return cleaned


def add_new_features(dframe):
    """Return a copy of ``dframe`` with three year-difference columns added.

    New columns:
        years_build_sold: 'Yr Sold' minus 'Year Built'
        years_mod_sold:   'Yr Sold' minus 'Year Remod/Add'
        years_build_mod:  'Year Remod/Add' minus 'Year Built'
    """
    sold = dframe['Yr Sold']
    built = dframe['Year Built']
    remodelled = dframe['Year Remod/Add']
    return dframe.assign(
        years_build_sold=sold - built,
        years_mod_sold=sold - remodelled,
        years_build_mod=remodelled - built,
    )


def drop_not_useful_columns(dframe):
    '''
    Return ``dframe`` without the identifier and raw date columns.

    Columns removed:
        Order           observation number
        PID             parcel identification number
        Mo Sold         month sold (MM)
        Yr Sold         year sold (YYYY)
        Year Built      original construction date
        Year Remod/Add  remodel date (same as construction date if no
                        remodeling or additions)
        Garage Yr Blt   year garage was built
    '''
    unwanted = [
        'Order',
        'PID',
        'Yr Sold',
        'Mo Sold',
        'Year Built',
        'Year Remod/Add',
        'Garage Yr Blt',
    ]
    return dframe.drop(columns=unwanted)


def convert_object_to_categorical(dframe):
    """Return a copy of ``dframe`` with every object column cast to category.

    Columns of any other dtype are carried over unchanged.
    """
    object_cols = dframe.select_dtypes(include=['object']).columns
    return dframe.astype({name: 'category' for name in object_cols})


def convert_num_to_categorical(dframe):
    '''
    Return a copy of ``dframe`` with numerically coded nominal columns
    cast to category.

    Only 'MS SubClass' is converted. Its codes identify the type of
    dwelling involved in the sale:

           020  1-STORY 1946 & NEWER ALL STYLES
           030  1-STORY 1945 & OLDER
           040  1-STORY W/FINISHED ATTIC ALL AGES
           045  1-1/2 STORY - UNFINISHED ALL AGES
           050  1-1/2 STORY FINISHED ALL AGES
           060  2-STORY 1946 & NEWER
           070  2-STORY 1945 & OLDER
           075  2-1/2 STORY ALL AGES
           080  SPLIT OR MULTI-LEVEL
           085  SPLIT FOYER
           090  DUPLEX - ALL STYLES AND AGES
           120  1-STORY PUD (Planned Unit Development) - 1946 & NEWER
           150  1-1/2 STORY PUD - ALL AGES
           160  2-STORY PUD - 1946 & NEWER
           180  PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
           190  2 FAMILY CONVERSION - ALL STYLES AND AGES
    '''
    nominal_codes = ('MS SubClass',)
    return dframe.astype(dict.fromkeys(nominal_codes, 'category'))


def drop_cat_columns_unique_over_10(df):
    """Remove category columns that hold more than 10 distinct values.

    Distinct values are counted with ``unique()``, so a missing value
    counts as one of them. When no column exceeds the limit the input
    frame itself is returned.
    """
    high_cardinality = [
        name
        for name in df.select_dtypes(include='category').columns
        if len(df[name].unique()) > 10
    ]
    if not high_cardinality:
        return df
    return df.drop(columns=high_cardinality)


def normalize_data(df):
    """Min-max scale the numeric feature columns into the range [0, 1].

    The column named by ``TARGET_COLUMN`` is left unscaled, and columns
    that are not selected as int/float are carried over unchanged.

    Parameters:
        df: input DataFrame. It is not modified; a scaled copy is returned.

    Returns:
        A new DataFrame with the same columns, in which each selected
        numeric feature column ``x`` is replaced by
        ``(x - min) / (max - min)``. A constant column (max == min) maps
        to 0.0 instead of NaN.
    """
    # errors='ignore' keeps this usable when the target is absent or not
    # among the selected numeric columns.
    numeric = (df.select_dtypes(include=['int', 'float'])
                 .drop(columns=TARGET_COLUMN, errors='ignore'))
    scaled = df.copy()
    if numeric.shape[1] == 0:
        return scaled
    lowest = numeric.min()
    # A zero range would produce 0/0 = NaN; dividing by 1 instead leaves
    # the (already zero) numerator as the result for constant columns.
    span = (numeric.max() - lowest).replace(0, 1)
    # Assign whole columns rather than writing through .loc so that integer
    # columns are replaced by float ones instead of being upcast in place.
    scaled[numeric.columns] = (numeric - lowest) / span
    return scaled


def convert_cat_to_dummy(df):
    """Return ``df`` with its categorical columns expanded into indicator
    (dummy) columns, using the ``pd.get_dummies`` defaults."""
    encoded = pd.get_dummies(df)
    return encoded


def drop_columns_with_low_variance(df, cutoff_num=0.0001, cutoff_cat=0.95):
    """Remove columns that carry almost no information.

    Parameters:
        df: input DataFrame (not modified).
        cutoff_num: int/float columns whose variance is below this value
            are dropped.
        cutoff_cat: category columns in which a single category accounts
            for more than this fraction of the rows are dropped.

    Returns:
        A new DataFrame without the offending columns.
    """
    # Numeric side: near-constant columns.
    variances = df.select_dtypes(include=['int', 'float']).var()
    flat_numeric = list(variances[variances < cutoff_num].index)

    # Categorical side: columns dominated by one category.
    categorical = df.select_dtypes(include='category')
    n_rows = categorical.shape[0]
    dominated = [
        name
        for name in categorical.columns
        if (categorical[name].value_counts() / n_rows > cutoff_cat).any()
    ]

    return df.drop(columns=flat_numeric + dominated)


def transform_features(df):
    """Run the full feature-engineering pipeline on the raw frame.

    Each step receives the output of the previous one; the order matters
    (for example, the derived year features are built before the raw year
    columns are dropped, and dummy encoding happens last).
    """
    steps = (
        add_new_features,
        drop_not_useful_columns,
        handle_missing_values,
        convert_object_to_categorical,
        convert_num_to_categorical,
        drop_cat_columns_unique_over_10,
        normalize_data,
        drop_columns_with_low_variance,
        convert_cat_to_dummy,
    )
    frame = df
    for step in steps:
        frame = step(frame)
    return frame

### Select features

In [539]:
def select_all_features(df):
    """Return the labels of every column of ``df`` except the target."""
    features_only = df.drop(columns=TARGET_COLUMN)
    return features_only.columns


def select_highest_corr_features(df, corr_thresh=0.4):
    """Pick the int64/float64 columns most correlated with the target.

    Parameters:
        df: DataFrame that contains ``TARGET_COLUMN``.
        corr_thresh: a column is kept when the absolute value of its
            correlation with the target is strictly above this value.

    Returns:
        Index of the selected column labels, strongest correlation first.
    """
    numeric = df.select_dtypes(include=['int64', 'float64'])
    target_corr = numeric.corr()[TARGET_COLUMN]
    strength = np.abs(target_corr).sort_values(ascending=False)
    # The target correlates perfectly with itself; it is not a feature.
    strength = strength.drop(TARGET_COLUMN)
    return strength[strength > corr_thresh].index


def select_features(df, select_func, *args, **kwargs):
    """Apply ``select_func`` to ``df`` and return whatever it returns.

    Any extra positional and keyword arguments are forwarded to
    ``select_func`` after ``df``.
    """
    chosen = select_func(df, *args, **kwargs)
    return chosen

### Train and test

In [540]:
def _holdout_rmse(train, test, features):
    """Fit a linear regression on ``train`` and return its RMSE on ``test``."""
    lr = LinearRegression()
    lr.fit(train[features], train[TARGET_COLUMN])
    predictions = lr.predict(test[features])
    return mean_squared_error(test[TARGET_COLUMN], predictions) ** 0.5


def train_and_test(select_func=None, k=0, *args, **kwargs):
    """Load the Ames data, build features and score a linear regression.

    Parameters:
        select_func: callable ``f(df, *args, **kwargs)`` returning the
            feature column labels to use. ``None`` (the default) falls back
            to ``select_all_features``.
        k: evaluation scheme.
            0  -> hold-out: first 1460 rows train, the rest test.
            1  -> same hold-out split after a seeded shuffle of the rows.
            >1 -> k-fold cross-validation (shuffled, ``random_state=1``).
        *args, **kwargs: forwarded to ``select_func``.

    Returns:
        The RMSE on the held-out rows (k = 0 or 1), or the mean of the
        per-fold RMSE values (k > 1).
    """
    if select_func is None:
        # The declared default used to end in "'NoneType' object is not
        # callable"; using every feature is the natural fallback.
        select_func = select_all_features

    data = transform_features(get_data('data/AmesHousing.tsv'))
    # NOTE(review): the transforms and the feature selection both see the
    # full data set before it is split, so held-out rows influence them.
    selected_features = select_features(data, select_func, *args, **kwargs)

    if k in (0, 1):
        if k == 1:
            # Seeded so that repeated runs give the same score, matching the
            # fixed random_state used by the k-fold branch.
            data = data.sample(frac=1, random_state=1)
        train_rows = 1460
        return _holdout_rmse(data[:train_rows], data[train_rows:],
                             selected_features)

    kf = KFold(n_splits=k, shuffle=True, random_state=1)
    # The scorer returns negated MSE values, hence the abs() before the root.
    mses = cross_val_score(LinearRegression(), data[selected_features],
                           data[TARGET_COLUMN],
                           scoring="neg_mean_squared_error", cv=kf)
    return np.mean(np.sqrt(np.abs(mses)))

In [544]:
# Score the model with 6-fold cross-validation, keeping only the features
# whose absolute correlation with the target is above 0.3.
train_and_test(select_func=select_highest_corr_features, k=6, corr_thresh=0.3)

34807.0286910009

In [537]:
# Build the transformed frame here so the cell also runs on a fresh kernel;
# inside train_and_test `data` is only a local variable.
data = transform_features(get_data('data/AmesHousing.tsv'))
# Dtype tally of the feature columns (target excluded).
data[select_all_features(data)].dtypes.value_counts()

uint8      229
float64     14
dtype: int64

In [538]:
# Dtype tally of every column, target included.
# NOTE(review): relies on `data` already holding the transformed frame
# (the result of transform_features); this cell does not create it.
data.dtypes.value_counts()

uint8      229
float64     14
int64        1
dtype: int64