In [None]:
#!/usr/bin/env python3

# This script pulls in salary data, builds and tests several predictive models,
# and then makes salary predictions on test data using the best model.

# Module metadata: script author and contact e-mail address.
__author__ = 'Tim McDonough'
__email__ = 'timothylmcdonough@gmail.com'

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def load_file(file):
    '''Read a CSV source into a pandas DataFrame.

    Args:
        file: Passed unchanged as the first argument of ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    '''
    frame = pd.read_csv(file)
    return frame

def consolidate_data(df1, df2, key=None, left_index=False, right_index=False):
    '''Inner-join two dataframes, keeping only records present in both.

    Args:
        df1: Left-hand dataframe.
        df2: Right-hand dataframe.
        key: Forwarded to ``pd.merge`` as ``on``.
        left_index: Forwarded to ``pd.merge`` as ``left_index``.
        right_index: Forwarded to ``pd.merge`` as ``right_index``.

    Returns:
        The result of ``pd.merge`` with ``how='inner'``.
    '''
    merge_options = {
        'how': 'inner',
        'on': key,
        'left_index': left_index,
        'right_index': right_index,
    }
    return pd.merge(df1, df2, **merge_options)

def clean_data(raw_df):
    '''Remove rows with a duplicate jobID or a salary <= 0.

    Args:
        raw_df: DataFrame with at least ``jobID`` and ``salary`` columns.

    Returns:
        A new DataFrame holding the first occurrence of every ``jobID``
        whose ``salary`` is strictly positive. ``raw_df`` is not modified.
    '''
    # Keep only the first row for each job ID.
    deduped = raw_df.drop_duplicates(subset='jobID')
    # Filter with a boolean mask; indexing by the raw salary values would
    # be interpreted as a column lookup instead of a row filter.
    return deduped[deduped.salary > 0]

def one_hot_encode_feature_df(df, cat_vars=None, num_vars=None):
    '''Build a feature dataframe from categorical and numeric columns.

    Args:
        df: Source dataframe.
        cat_vars: Column selection passed through ``pd.get_dummies``.
        num_vars: Column selection converted with ``pd.to_numeric``.

    Returns:
        The one-hot encoded categorical columns followed by the numeric
        columns, concatenated column-wise.
    '''
    pieces = (
        pd.get_dummies(df[cat_vars]),
        df[num_vars].apply(pd.to_numeric),
    )
    return pd.concat(pieces, axis=1)

def get_target_df(df, target):
    '''Select ``target`` from ``df`` and return the result.'''
    target_df = df[target]
    return target_df

def train_model(model, feature_df, target_df, num_procs, mean_mse, cv_std):
    '''Cross-validate ``model`` and record its MSE statistics.

    Args:
        model: Estimator handed to ``cross_val_score``; also used as the
            key under which results are stored.
        feature_df: Feature data passed to ``cross_val_score``.
        target_df: Target data passed to ``cross_val_score``.
        num_procs: Forwarded to ``cross_val_score`` as ``n_jobs``.
        mean_mse: Dict updated in place with ``model`` -> mean MSE.
        cv_std: Dict updated in place with ``model`` -> standard deviation
            of the per-fold scores.

    Returns:
        None. Results are written into ``mean_mse`` and ``cv_std``.
    '''
    neg_mse = cross_val_score(
        model,
        feature_df,
        target_df,
        cv=2,
        n_jobs=num_procs,
        scoring='neg_mean_squared_error',
    )
    # The scorer yields negated MSE values, so flip the sign of the mean.
    mean_mse[model] = -1.0 * np.mean(neg_mse)
    cv_std[model] = np.std(neg_mse)
    
def print_summary(model, mean_mse, cv_std):
    '''Print the model followed by its stored MSE and CV standard deviation.

    Args:
        model: Object printed first and used as the lookup key.
        mean_mse: Mapping from model to its average MSE.
        cv_std: Mapping from model to its cross-validation standard deviation.
    '''
    print('\nModel:\n', model)
    # Look each statistic up just before printing it, one table at a time.
    stat_tables = (
        ('Average MSE:\n', mean_mse),
        ('Standard deviation during CV:\n', cv_std),
    )
    for heading, table in stat_tables:
        print(heading, table[model])
    
def save_results(model, mean_mse, predictions, feature_importances):
    '''Write the model, feature importances and predictions to disk.

    Files are created relative to the current working directory:
    ``model.txt`` (``str(model)``), ``feature_importances.csv`` and
    ``predictions.csv`` (comma-delimited).

    Args:
        model: Object whose string form is written to ``model.txt``.
        mean_mse: Accepted but not referenced by this function.
        predictions: Array-like saved with ``np.savetxt``.
        feature_importances: Object providing ``to_csv``.
    '''
    model_text = str(model)
    with open('model.txt', 'w') as handle:
        handle.write(model_text)

    feature_importances.to_csv('feature_importances.csv')

    np.savetxt('predictions.csv', predictions, delimiter=',')
        
if __name__ = '__main__':
    #define inputs
    train_freature_file = 'data/train_features.csv'
    train_target_file = 'data/train_salaries.csv'
    test_feature_file = 'data/test_feature.csv'
    
    #define variables
    categorical_vars =['companyId', 'jobType', 'degree,' ]