In [1]:
import pickle
import pandas as pd

In [2]:
# Download stations_projections.pickle from the mas-dse-open S3 bucket;
# `curl -o` writes the response to that filename in the current directory.
!curl -o "stations_projections.pickle" "http://mas-dse-open.s3.amazonaws.com/Weather/stations_projections.pickle"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2750k  100 2750k    0     0  1192k      0  0:00:02  0:00:02 --:--:-- 1192k


In [3]:
# Load the pickled object into `data` (later cells use it as a pandas DataFrame).
# Pickle streams are binary: open with 'rb' (text mode fails on Python 3 and can
# corrupt the stream on Windows) and use a context manager so the handle is closed.
# NOTE(review): pickle.load can execute arbitrary code and this file is fetched
# over plain HTTP -- only load it if the source is trusted.
with open("stations_projections.pickle", 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [4]:
# Split each vector-valued coefficient column into three scalar columns
# (<name>1, <name>2, <name>3 hold entries 0, 1, 2 of every row's vector),
# then drop the original vector column. The 'station' column is dropped too.
coeff_columns = [u'TAVG_coeff', u'TRANGE_coeff', u'SNWD_coeff']
for vector_col in coeff_columns:
    vectors = list(data[vector_col])
    for k in (0, 1, 2):
        data[vector_col + str(k + 1)] = [vec[k] for vec in vectors]
    data.drop(labels=vector_col, axis=1, inplace=True)
data.drop(labels='station', axis=1, inplace=True)
data.head(1)

Unnamed: 0,latitude,longitude,elevation,dist_coast,TAVG_coeff1,TAVG_coeff2,TAVG_coeff3,TRANGE_coeff1,TRANGE_coeff2,TRANGE_coeff3,SNWD_coeff1,SNWD_coeff2,SNWD_coeff3
0,36.0042,-119.96,73.2,107.655,3047.962363,1974.34852,150.560792,-2903.632879,-236.907268,147.021791,0.191503,0.187263,-0.040138


In [5]:
from sklearn.linear_model import LinearRegression

### Using first order terms

In [40]:
# Compute score changes
def compute_scores(y_label,X_Train,y_Train,X_test,Y_test):
    """Print the R-squared of a linear fit and the drop caused by removing each column.

    A LinearRegression is fitted on (X_Train, y_Train) and its train/test
    R-squared is printed. Then, for every column index i of X_Train, the model
    is refitted without that column and the decrease in train and test score
    relative to the full model is printed.

    Parameters
    ----------
    y_label : name of the target; only used in the printed messages.
    X_Train, X_test : 2-D arrays indexable as X[:, list_of_columns], with the
        same number of columns.
    y_Train, Y_test : target values matching the rows of X_Train / X_test.

    Returns
    -------
    None. All results are printed.

    Notes
    -----
    The removed column is named via the module-level ``data.columns[i]``, so
    column i of the feature matrices must correspond to column i of ``data``.
    """
    full_model = LinearRegression()
    full_model.fit(X_Train, y_Train)

    train_score = full_model.score(X_Train, y_Train)
    test_score = full_model.score(X_test, Y_test)
    print('R-squared(Coeff. of determination): Train:%.3f, Test:%.3f\n' % (train_score,test_score))

    # Derive the column count from the input instead of assuming 4 features,
    # so the ablation is correct for any feature-matrix width.
    n_columns = X_Train.shape[1]
    for i in range(n_columns):
        # Indices of every column except the one being ablated, in order.
        kept = [c for c in range(n_columns) if c != i]
        r_train_X = X_Train[:, kept]
        r_test_X = X_test[:, kept]

        reduced_model = LinearRegression()
        reduced_model.fit(r_train_X, y_Train)
        r_train_score = reduced_model.score(r_train_X, y_Train)
        r_test_score = reduced_model.score(r_test_X, Y_test)

        print("Predicting %s Column removed: %s" % (y_label, data.columns[i]))
        print("Decrease in Train score: %.3f" % (train_score-r_train_score))
        print("Decrease in Test score: %.3f \n" % (test_score-r_test_score))

In [41]:
from numpy.random import rand
# Random split: a row goes to Train when its uniform draw exceeds 0.5,
# otherwise to Test.
# NOTE(review): no seed is set in this cell, so the split changes on every run.
N=data.shape[0]
train_i = rand(N)>0.5
# Select rows with the boolean mask through .loc; .ix is deprecated and was
# removed in pandas 1.0.
Train = data.loc[train_i,:]
Test  = data.loc[~train_i,:]
print("%s %s %s" % (data.shape, Train.shape, Test.shape))

(12140, 13) (5987, 13) (6153, 13)


In [42]:
from sklearn.cross_validation import train_test_split

# Predictors are the first four columns of the frame, taken by position.
# .iloc makes the positional intent explicit; .ix is deprecated and was
# removed in pandas 1.0.
train_X = Train.iloc[:, :4].values
test_X = Test.iloc[:, :4].values
print("%s %s" % (train_X.shape, test_X.shape))

# Fit one linear model per target column <target>_coeff<j> and report its
# coefficients, then the per-column score decreases from compute_scores.
for target in ["TAVG","TRANGE","SNWD"]:
    for j in range(1,4):
        y_label = target+"_coeff"+str(j)
        train_y = Train[y_label]
        test_y = Test[y_label]
        lg = LinearRegression()
        lg.fit(train_X,train_y)

        print("Target variable:  %s" % y_label)
        print("Regression Coefficients:  %s" % (['%.2f' % c for c in lg.coef_],))

        compute_scores(y_label, train_X, train_y, test_X, test_y)
        print("\n\n")

(5987, 4) (6153, 4)
Target variable:  TAVG_coeff1
Regression Coefficients:  ['-152.82', '-18.99', '-0.66', '-0.14']
R-squared(Coeff. of determination): Train:0.930, Test:0.932

Predicting TAVG_coeff1 Column removed: latitude
Decrease in Train score: 0.605
Decrease in Test score: 0.620 

Predicting TAVG_coeff1 Column removed: longitude
Decrease in Train score: 0.064
Decrease in Test score: 0.069 

Predicting TAVG_coeff1 Column removed: elevation
Decrease in Train score: 0.122
Decrease in Test score: 0.127 

Predicting TAVG_coeff1 Column removed: dist_coast
Decrease in Train score: 0.003
Decrease in Test score: 0.003 




Target variable:  TAVG_coeff2
Regression Coefficients:  ['-4.92', '7.63', '-0.15', '0.48']
R-squared(Coeff. of determination): Train:0.600, Test:0.588

Predicting TAVG_coeff2 Column removed: latitude
Decrease in Train score: 0.007
Decrease in Test score: 0.006 

Predicting TAVG_coeff2 Column removed: longitude
Decrease in Train score: 0.119
Decrease in Test score: 0.113

### Using second order terms

In [94]:
def _term_label(exponents, feature_names):
    """Build a readable name such as 'latitude*longitude' for one polynomial term.

    exponents : exponent of each input feature in the term.
    feature_names : names of the input features, in the same order.

    Each name is repeated as many times as its exponent and the factors are
    joined with '*'. A term whose exponents are all zero is labelled '1'.
    """
    factors = []
    for name, exponent in zip(feature_names, exponents):
        factors.extend([name] * int(exponent))
    return "*".join(factors) if factors else "1"


def compute_scores_second_order(y_label,X_Train,y_Train,X_test,Y_test,powers):
    """Print the R-squared of a linear fit on expanded features and per-term ablations.

    A LinearRegression is fitted on (X_Train, y_Train) and its train/test
    R-squared is printed. Then each column of X_Train is removed in turn, the
    model is refitted, and the decrease in train and test score relative to
    the full model is printed together with a label for the removed term.

    Parameters
    ----------
    y_label : name of the target; only used in the printed messages.
    X_Train, X_test : 2-D arrays indexable as X[:, list_of_columns], with the
        same number of columns.
    y_Train, Y_test : target values matching the rows of X_Train / X_test.
    powers : indexable with one row per column of X_Train; row i holds the
        exponent of each original feature in term i (the caller passes
        PolynomialFeatures.powers_).

    Returns
    -------
    None. All results are printed.

    Notes
    -----
    Feature names are read from the module-level ``data.columns``: the first
    ``len(powers[i])`` columns of ``data`` must be the original features, in
    the order used to build the expanded matrix.
    """
    full_model = LinearRegression()
    full_model.fit(X_Train, y_Train)

    train_score = full_model.score(X_Train, y_Train)
    test_score = full_model.score(X_test, Y_test)
    print('R-squared(Coeff. of determination): Train:%.3f, Test:%.3f\n' % (train_score,test_score))

    n_columns = X_Train.shape[1]
    for removed in range(n_columns):
        # Indices of every column except the one being ablated, in order.
        kept = [c for c in range(n_columns) if c != removed]
        r_train_X = X_Train[:, kept]
        r_test_X = X_test[:, kept]

        reduced_model = LinearRegression()
        reduced_model.fit(r_train_X, y_Train)
        r_train_score = reduced_model.score(r_train_X, y_Train)
        r_test_score = reduced_model.score(r_test_X, Y_test)

        # Name the removed term from its exponent row. The number of base
        # features comes from the row length rather than a hard-coded 4.
        exponents = powers[removed]
        col = _term_label(exponents, data.columns[:len(exponents)])

        print("Predicting %s Column removed: %s" % (y_label, col))
        print("Decrease in Train score: %.5f" % (train_score-r_train_score))
        print("Decrease in Test score: %.5f \n" % (test_score-r_test_score))

In [95]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion (linear, squared and pairwise-product terms)
# without the constant bias column.
second_order = PolynomialFeatures(degree=2, include_bias=False)

In [96]:
# Fit the feature expansion on the training matrix only and reuse it for the
# test matrix; a transformer should not be re-fitted on test data.
train_X_second = second_order.fit_transform(train_X)
test_X_second = second_order.transform(test_X)
# powers_[i, k] is the exponent of input feature k in expanded column i.
powers = second_order.powers_
print("%s %s" % (train_X_second.shape, test_X_second.shape))

# Fit one linear model per target column <target>_coeff<j> on the expanded
# features, report its coefficients, then the per-term score decreases.
for target in ["TAVG","TRANGE","SNWD"]:
    for j in range(1,4):
        y_label = target+"_coeff"+str(j)
        train_y = Train[y_label]
        test_y = Test[y_label]
        lg = LinearRegression()
        lg.fit(train_X_second,train_y)

        print("Target variable:  %s" % y_label)
        print("Regression Coefficients:  %s" % (['%.2f' % c for c in lg.coef_],))

        compute_scores_second_order(y_label, train_X_second, train_y, test_X_second, test_y, powers)
        print("\n\n")

(5987, 14) (6153, 14)
Target variable:  TAVG_coeff1
Regression Coefficients:  ['-264.54', '61.49', '-0.84', '-0.19', '-0.05', '-1.23', '-0.01', '-0.02', '0.13', '-0.01', '-0.01', '-0.00', '0.00', '-0.00']
R-squared(Coeff. of determination): Train:0.955, Test:0.959

Predicting TAVG_coeff1 Column removed: latitude
Decrease in Train score: 0.00549
Decrease in Test score: 0.00553 

Predicting TAVG_coeff1 Column removed: longitude
Decrease in Train score: 0.00041
Decrease in Test score: 0.00045 

Predicting TAVG_coeff1 Column removed: elevation
Decrease in Train score: 0.00091
Decrease in Test score: -0.00001 

Predicting TAVG_coeff1 Column removed: dist_coast
Decrease in Train score: 0.00002
Decrease in Test score: 0.00010 

Predicting TAVG_coeff1 Column removed: latitude*latitude
Decrease in Train score: 0.00000
Decrease in Test score: 0.00001 

Predicting TAVG_coeff1 Column removed: latitude*longitude
Decrease in Train score: 0.00536
Decrease in Test score: 0.00533 

Predicting TAVG_coef