### Author = 'Timothy McDonough'
### Email = 'timothylmcdonough@gmail.com'

## Part 4: Modeling the Data  
### 4a. Preparation

Importing libraries to perform preparation:

In [39]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

Let's define a `Data` class that bundles the functions needed to prepare the data set:

In [47]:
class Data:
    '''Loads, cleans, and label-encodes the train/test dataframes used for modeling.

    Instances keep one fitted label encoder per categorical column in
    ``self.label_encoders`` so the encoding fitted on the training data is
    reused for the test data. The static helpers at the bottom of the class
    (``load_data``, ``one_hot_encode_feature_df``, ``split_data``,
    ``get_target_df``) do not need an instance and are called as ``Data.<name>(...)``.
    '''

    def __init__(self, train_feature_df, train_target_df, test_df, cat_vars, num_vars, target_var, id_col):
        '''create train and test dataframes

        train_feature_df, train_target_df, test_df: CSV locations handed to ``pd.read_csv``
        cat_vars: names of the categorical columns (these get label encoded)
        num_vars: names of the numeric columns
        target_var: name of the target column
        id_col: name of the id column
        '''
        #create new copies instead of references
        self.cat_vars = list(cat_vars)
        self.num_vars = list(num_vars)
        #built from the copies so mixed list/tuple inputs concatenate cleanly
        self.feature_vars = self.cat_vars + self.num_vars
        self.target_col = target_var
        self.id_col = id_col
        self.label_encoders = {}
        self.train_df = self._create_train_df(train_feature_df, train_target_df)
        self.test_df = self._create_test_df(test_df)

    def label_encode_df(self, df, cols):
        '''creates one label encoder for each column in the data object instance

        Encodes ``df`` in place. Columns that already have an encoder in
        ``self.label_encoders`` reuse it; other columns get a newly fitted one.
        '''
        for col in cols:
            #None means "no encoder yet" -> _label_encode fits and stores a new one
            self._label_encode(df, col, self.label_encoders.get(col))

    def inverse_encode_df(self, df, cols):
        '''does inverse label encoding

        Decodes ``df`` in place. Raises ValueError if any column in ``cols``
        has no stored encoder.
        '''
        for col in cols:
            if col not in self.label_encoders:
                raise ValueError("label_encoders must be define for each col before calling inverse_encode_df")
            self._inverse_label_encode(df, col)

    def _label_encode(self, df, col, le=None):
        '''label encodes data

        With ``le`` given, transforms ``df[col]`` using that encoder. Without
        it, fits a new LabelEncoder on ``df[col]``, stores it in
        ``self.label_encoders[col]``, and then transforms the column.
        '''
        if le is None:
            le = LabelEncoder()
            le.fit(df[col])
            self.label_encoders[col] = le
        #NOTE: LabelEncoder.transform raises ValueError for labels it was not fitted on
        df[col] = le.transform(df[col])

    def _inverse_label_encode(self, df, col):
        '''inverse label encodes data'''
        le = self.label_encoders[col]
        df[col] = le.inverse_transform(df[col])

    def _create_train_df(self, train_feature_df, train_target_df, preprocess=True, label_encode=True):
        '''loads and merges training data features and targets, preprocesses data, encodes data'''
        train_feature_df = self._load_data(train_feature_df)
        train_target_df = self._load_data(train_target_df)
        train_df = self._merge_dfs(train_feature_df, train_target_df)
        if preprocess:
            train_df = self._clean_data(train_df)
            train_df = self._shuffle_data(train_df)
        if label_encode:
            self.label_encode_df(train_df, self.cat_vars)
        return train_df

    def _create_test_df(self, test_file, label_encode=True):
        '''loads and label encodes test data'''
        test_df = self._load_data(test_file)
        if label_encode:
            #reuses the encoders fitted on the training data where they exist
            self.label_encode_df(test_df, self.cat_vars)
        return test_df

    def _load_data(self, file):
        '''reads a CSV file into a dataframe'''
        return pd.read_csv(file)

    def _merge_dfs(self, df1, df2, key=None, left_index=True, right_index=True):
        '''inner-joins two dataframes; with the default arguments the join is on the row index'''
        return pd.merge(left=df1, right=df2, how='inner', on=key, left_index=left_index, right_index=right_index)

    def _clean_data(self, df):
        '''remove rows with duplicate ids (the first occurrence is kept)'''
        #NOTE(review): the column name 'id' is hard-coded rather than taken from self.id_col -- confirm intent
        df = df.drop_duplicates(subset='id')
        return df

    def _shuffle_data(self, df):
        '''shuffles the rows; reset_index() keeps the previous index as an 'index' column'''
        return shuffle(df).reset_index()

    @staticmethod
    def load_data(file):
        '''reads a CSV file into a dataframe'''
        return pd.read_csv(file)

    @staticmethod
    def one_hot_encode_feature_df(df, cat_vars=None, num_vars=None):
        '''performs one-hot encoding on all categorical variables and combines result with continuous variables

        Either column list may be None/empty, in which case that part is
        skipped. With both missing, an empty dataframe on ``df``'s index is returned.
        '''
        parts = []
        if cat_vars is not None and len(cat_vars) > 0:
            parts.append(pd.get_dummies(df[cat_vars]))
        if num_vars is not None and len(num_vars) > 0:
            parts.append(df[num_vars].apply(pd.to_numeric))
        if not parts:
            return pd.DataFrame(index=df.index)
        return pd.concat(parts, axis=1)

    @staticmethod
    def split_data(df):
        '''splits data into train and test sets (random 80/20 split)'''
        train_feature_df, test_df = train_test_split(df, test_size=0.2)
        return pd.DataFrame(train_feature_df), pd.DataFrame(test_df)

    @staticmethod
    def get_target_df(df, target):
        '''returns target dataframe

        ``target`` is a column name or a list of column names; the result is ``df[target]``.
        '''
        return df[target]


Now we can pull in the data file and prepare our data for modeling.

In [95]:
if __name__ == '__main__':

    #move working directory to correct location
    os.chdir('/Users/timothylmcdonough/DSDJ/NYC_AirBnB/Data')

    #load file
    raw_df = Data.load_data('NYC_2019.csv')

    #create dataframe for Bronx only
    Bronx_df = pd.DataFrame(raw_df[raw_df['neighbourhood_group'] == 'Bronx'])

    #define variables
    #cat_cols / num_cols are not used below; they are kept for the optional one-hot encoding step
    cat_cols = ['name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'room_type', 'last_review']
    num_cols = ['host_id', 'id', 'number_of_reviews', 'latitude', 'longitude', 'minimum_nights', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']
    target_cols = ['host_id', 'price']

    #shuffle training data -- shuffling may improve cross-validation accuracy
    clean_train_df = shuffle(Bronx_df)

    #drop columns not helpful to predictions
    #NOTE(review): 'price' and 'host_id' are not dropped, so they are also written to both feature files -- confirm this is intended
    clean_train_df = clean_train_df.drop(columns=['id', 'name', 'host_name', 'room_type', 'last_review', 'latitude', 'longitude', 'minimum_nights', 'calculated_host_listings_count', 'number_of_reviews', 'reviews_per_month', 'availability_365'])

    #split dataset
    train_feature_df, test_df = Data.split_data(clean_train_df)

    #get target dataframe from the training rows only, in the same row order as
    #train_feature_df: the files are written without an index, so row order is
    #the only thing tying train_prices.csv to train_features.csv
    target_df = Data.get_target_df(train_feature_df, target_cols)

    #write dataframes to .csv files for modeling Jupyter notebook
    train_feature_df.to_csv('train_features.csv', header=True, index=False)
    test_df.to_csv('test_features.csv', header=True, index=False)
    target_df.to_csv('train_prices.csv', header=True, index=False)