In [2]:
#441702 rwelish@wustl.edu Welish Ryan
# X X tmanders@wustl.edu Manders Toby
# X X Foeller Sebastain

import os
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def loadTrainData(path='../input/train.csv'):
    '''
    Read the training CSV and separate the features from the target.

    input:
        path: location of the training CSV file
    output:
        X: DataFrame with every column except the target
        y: Series with the 'Horizontal_Distance_To_Fire_Points' column
    '''
    target = 'Horizontal_Distance_To_Fire_Points'
    frame = pd.read_csv(path)
    features = frame.drop([target], axis=1)
    labels = frame[target]
    return features, labels

def loadFeatureAddedTrainData(path='../input/train.csv'):
    '''
    Read the training CSV and replace the two hydrology distances with a
    single engineered feature.

    The new 'dist_to_hydro' column is sqrt(horizontal^2 + vertical^2); the
    two source columns are then dropped. Strictly positive values of the
    new column are replaced by their reciprocal, other values are left as
    they are.

    input:
        path: location of the training CSV file
    output:
        X: DataFrame without the target and without the 'ID' column
        y: Series with the 'Horizontal_Distance_To_Fire_Points' column
    '''
    target = 'Horizontal_Distance_To_Fire_Points'
    horizontal = "Horizontal_Distance_To_Hydrology"
    vertical = "Vertical_Distance_To_Hydrology"
    inverse_cols = ["dist_to_hydro"]

    df = pd.read_csv(path)
    squared_sum = np.square(df[horizontal]) + np.square(df[vertical])
    df["dist_to_hydro"] = np.sqrt(squared_sum)
    df = df.drop([vertical, horizontal], axis=1)

    for col in inverse_cols:
        # Only invert strictly positive entries so zeros never get divided.
        positive = df[col] > 0
        df.loc[positive, col] = 1 / df.loc[positive, col]

    features = df.drop([target, 'ID'], axis=1)
    return features, df[target]


def loadTestData(path='../input/test.csv'):
    '''
    Read the test CSV and report its file name and shape.

    input:
        path: location of the test CSV file
    output:
        the full DataFrame exactly as read (no columns are removed)
    '''
    df = pd.read_csv(path)
    summary = f"Loaded {os.path.basename(path)}. Shape: {df.shape}"
    print(summary)
    return df

def rmse(yTr,yTe):
    '''
    Root mean square error between true labels and predictions.

    input:
        yTr: true labels
        yTe: predictions
    output:
        square root of mean_squared_error(yTr, yTe) as a float
    '''
    mse = mean_squared_error(yTr, yTe)
    return math.sqrt(mse)


def tuneAlpha(model, alphas=None):
    '''
    Search for the alpha that gives the lowest cross-validation score.

    input:
        model: estimator class (e.g. Ridge or Lasso); it is instantiated as
               model(alpha=..., normalize=True) for every candidate
        alphas: optional iterable of candidate alpha values; when omitted,
                20 evenly spaced values from 0.01 to 0.2 are tried
    output:
        (bestAlpha, bestScore): the candidate with the lowest score returned
        by runCrossValidation together with that score. If there are no
        candidates (or no score compares lower than infinity) the result is
        (None, math.inf).
    '''
    if alphas is None:
        alphas = np.linspace(.01,.2,num=20)

    # Start from +inf rather than an arbitrary cap so that the first real
    # score always wins, whatever the scale of the target is.
    bestAlpha = None
    bestScore = math.inf
    for alpha in alphas:
        # NOTE(review): `normalize` was removed from scikit-learn's linear
        # models in release 1.2 - confirm the installed version accepts it.
        regressor = model(alpha = alpha,normalize = True)
        score = runCrossValidation(regressor)
        if score < bestScore:
            bestAlpha,bestScore = alpha,score
    print("Best alpha was {} with score {}".format(bestAlpha,bestScore))
    return bestAlpha, bestScore

def runCrossValidation(untrainedModel,folds=10):
    '''
    Cross-validate a model on the feature-engineered training data.

    input:
        untrainedModel: estimator handed to cross_val_score
        folds: value passed as cv to cross_val_score
    output:
        mean over the folds of sqrt(-score), where the scores come from the
        "neg_mean_squared_error" scoring; the mean and twice the standard
        deviation are also printed
    '''
    features, labels = loadFeatureAddedTrainData()
    negMse = cross_val_score(
        untrainedModel,
        features,
        labels,
        cv=folds,
        scoring="neg_mean_squared_error",
    )
    # Scores are negated MSE values, so flip the sign before the square root.
    foldErrors = np.sqrt(-negMse)
    meanError = foldErrors.mean()
    print("Error: %0.2f (+/- %0.2f)" % (meanError, foldErrors.std() * 2))
    return meanError

def runExperiment(model, test_size=.2, featureEnginnering = False, random_state=None):
    '''
    Runs a model on the training data by splitting it into train/validation sets
    input:
        model: model to be run - function taking in (Xtr,yTr,Xte) returning predictions
        test_size: passed to train_test_split as the size of the validation split
        featureEnginnering: when True the data comes from
            loadFeatureAddedTrainData(), otherwise from loadTrainData()
        random_state: forwarded to train_test_split; pass a fixed value to make
            the split (and therefore the reported error) repeatable. The default
            None keeps the previous unseeded behaviour.
    output:
        the root mean square error on the validation split (also printed)
    '''
    X,y = loadFeatureAddedTrainData() if featureEnginnering else loadTrainData()
    xTr,xVal, yTr, yVal = train_test_split(X,y,test_size = test_size,random_state = random_state)
    preds = model(xTr,yTr,xVal)
    # rmse is documented as (true labels, predictions); pass them in that order.
    error = rmse(yVal,preds)
    print("Error of {} \n".format(error))
    return error

def predictOnTestData(model):
    '''
    Fit on the full training data, predict the test data and write the
    predictions to 'submission.csv' in the current working directory.

    input:
        model: model to be run - function taking in (Xtr,yTr,Xte) returning predictions
    output:
        nothing is returned; the CSV has the columns 'ID' (taken from the
        test data) and 'Horizontal_Distance_To_Fire_Points' (the predictions)
    '''
    print("Making predictions on test set")
    trainFeatures, trainLabels = loadTrainData()
    testFrame = loadTestData()
    predictions = model(trainFeatures, trainLabels, testFrame)
    submission = pd.DataFrame({
        'ID': testFrame.ID,
        'Horizontal_Distance_To_Fire_Points': predictions,
    })
    submission.to_csv('submission.csv', index=False)

def RidgeRegression(xTr,yTr,xTe,alpha = 0.05):
    '''
    Fit a Ridge model and predict with it.

    input:
        xTr: training features (n x d)
        yTr: training targets (n x 1)
        xTe: features to predict on
        alpha: value passed to Ridge as alpha
    output:
        predictions of the fitted model for xTe
    '''
    hyperparams = {'alpha': alpha, 'normalize': True}
    estimator = Ridge(**hyperparams)
    estimator.fit(xTr, yTr)
    predictions = estimator.predict(xTe)
    return predictions

def LassoRegression(xTr,yTr,xTe,alpha = 0.05):
    '''
    Fit a Lasso model and predict with it.

    input:
        xTr: training features (n x d)
        yTr: training targets (n x 1)
        xTe: features to predict on
        alpha: value passed to Lasso as alpha
    output:
        predictions of the fitted model for xTe
    '''
    hyperparams = {'alpha': alpha, 'normalize': True}
    estimator = Lasso(**hyperparams)
    estimator.fit(xTr, yTr)
    predictions = estimator.predict(xTe)
    return predictions

def main():
    '''
    Entry point: tunes alpha for Ridge via tuneAlpha.

    The commented lines are the other experiments that were run from here;
    uncomment one to repeat it.
    '''
    # Single train/validation split experiments:
    #   runExperiment(RidgeRegression, featureEnginnering = True)
    #   runExperiment(LassoRegression)
    # Write a submission file:
    #   predictOnTestData(RidgeRegression)
    # Other tuning / validation runs:
    #   tuneAlpha(Lasso)
    #   runCrossValidation()
    tuneAlpha(Ridge)


# if __name__ == '__main__':
#     main()


In [3]:
# Rebuild the engineered hydrology feature at top level so the resulting
# frame can be inspected (same steps as loadFeatureAddedTrainData, but the
# 'ID' and target columns are kept).
df = pd.read_csv('../input/train.csv')
inverse_cols = ["dist_to_hydro"]

# Combine the horizontal and vertical distances into one column, then drop
# the two source columns.
df["dist_to_hydro"] = np.sqrt(
    np.square(df["Horizontal_Distance_To_Hydrology"])
    + np.square(df["Vertical_Distance_To_Hydrology"])
)
df = df.drop(['Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology'], axis=1)

# Replace strictly positive values by their reciprocal; other values stay put.
for col in inverse_cols:
    df.loc[df[col] > 0, col] = 1 / df.loc[df[col] > 0, col]
df

Unnamed: 0,ID,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Soil_Type,Horizontal_Distance_To_Fire_Points,dist_to_hydro
0,437495769,3106,347,5,1400,210,231,160,7202,1348,0.023803
1,2983472225,3132,121,1,466,221,237,152,7101,2614,0.000000
2,4070272466,3255,69,13,870,233,214,110,7756,904,0.006143
3,5686764697,3208,64,19,1473,234,199,90,7201,4831,0.002720
4,5721364516,3224,149,15,1604,239,238,124,7202,1557,0.003821
5,7144414601,3242,269,13,953,185,244,199,7202,1831,0.007918
6,7335785220,3194,158,14,1495,234,242,135,7202,1691,0.007294
7,7761296949,3259,11,22,1187,190,189,129,7700,5220,0.001903
8,8784190160,3270,96,23,1288,250,199,65,4758,1727,0.001622
9,9174673355,3177,260,20,1214,166,246,216,8776,1374,0.006018
