In [89]:
import math
import numpy as np
from sklearn import linear_model
import time
X = np.array([[ 1, 4,90],
              [10, 5,8],
              [ 3, 9,10],
             [23,23,234]])
Y=np.array([1,2,3,10])

In [77]:
# this piece of code is copied directly from the blog. this is just a decorator to measure the performance
import time
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print (  '{}  {} ms'.format(method.__name__, (te - ts) * 1000) )
        return result
    return timed

In [97]:
# Simple Least Squared Method Normal Equation
#Linear Regression 
class LinearRegression():
    def __init__(self,Y,X):
        #Y is an vecotor, X is a matrix, they are numpy object
        self.nobs=Y.shape[0]
        self.nvar=X.shape[1]+1 # intercept included
                
        if self.nobs<self.nvar:                
            raise ValueError("Degree of freedom is less than 0, impossible to estimate all parameters")
        if np.linalg.matrix_rank(X)!=min(X.shape): 
            raise ValueError("Matrix X does not have full rank.Only non-singular matrix apply this method.")     
        if Y.shape[0]!=X.shape[0]: 
            raise ValueError("Their dims don't match. nrow of Y={}, nrow of X={}".format(Y.shape[0],X.shape[0]))       
        
        self.Y=Y
        self.X=np.insert(X,0,1,axis=1) # insert intercept into first column
        self.result={}     
            
    def __str__(self):
        return("Linear Regression Model: \nnumber of observations={}\nnumber of variables={} (intercept included)".format(self.Y.shape[0],self.X.shape[1]))
    def __repr__(self):
        return(self.__str__())
   
    @timeit
    def NormalEquation(self):
        ''' OLS normal equation method ： this formular require X to be full rank '''
        # run the complete regression include all the variables           
        X = self.X
        XtX_inv=np.linalg.inv(np.dot(X.T,X))
        self.result["OLS_Coefficients"]=np.dot(np.dot(XtX_inv,X.T),Y)
        self.result["OLS_Y_predict"]=np.dot(self.X,self.result["OLS_Coefficients"])
        self.result["OLS_Residuals"]=self.Y-self.result["OLS_Y_predict"]
        self.result["OLS_AIC"]=self.nobs*np.power(self.result["OLS_Residuals"],2).sum()+self.nvar*2
#         self.result["OLS_Y_predict"]=
        for keys,values in self.result.items():
            print(keys)
            print(values)
        
       
    
mymodel=LinearRegression(Y=Y,X=X)
mymodel.NormalEquation()

OLS_Coefficients
[-0.66389074  0.07519987  0.38124584  0.00070786]
OLS_Y_predict
[  1.   2.   3.  10.]
OLS_Residuals
[  1.07691633e-14  -8.88178420e-15   1.37667655e-14  -5.32907052e-15]
OLS_AIC
8.0


In [4]:
mymodel.GetResidual(coe=[1,1,1,1])

array([ -95,  -22,  -20, -271])

In [92]:
class ForwardSelection(LinearRegression):
    '''step-wise selection of variables, each time put the most correlated variable into the model   '''
    def __init__(self,Y,X):
        LinearRegression.__init__(self,Y=Y,X=X)
        self.active=[]
        self.inactive=[i for i in range(self.nvar)]
        self.coeset={0:np.array([0 for i in range(self.nvar)])}# initialize the first round coe
    
    def OLS(self,X=None):
        ''' OLS normal equation method ： this formular require X to be full rank '''  
        X= self.X if X is None else X
        XtX_inv=np.linalg.inv(np.dot(X.T,X))
        self.coe=np.dot(np.dot(XtX_inv,X.T),Y)
        return(self.coe)
    
    def GetResidual(self,coe=None):
        coe = self.coe if coe is None else coe
        if coe is None:
            coe = ForwardSelection.OLS(self)  
        return(self.Y-np.dot(self.X,coe)) 
               
    def stepwise(self):
        for j in range(self.nvar):
            last_residual=self.GetResidual(coe=self.coeset[j]) # residual from last run
            list_of_cov=[np.dot(self.X[:,i],last_residual) for i in self.inactive]  # see the all the covariance of redisual and inactive variables
            nextvar=self.inactive[list_of_cov.index(max(list_of_cov))] # choose the variable that is most correlated with the residuals
            
            # move this variable from inactive set to active set
            self.active.append(nextvar) 
            self.inactive.remove(nextvar)
            
            X=self.X[:,self.active]#picking all the variables in active set
            newcoe=ForwardSelection.OLS(self,X=X)# use this slice of X to run linear regression and get coeffecients
            # but the form [c,d,a,b] is not what we want, we would like to have form like this [0,a,0,b,c,d,0,0]
            finalcoe=[None for i in range(self.nvar)] 
            for i in range(len(finalcoe)):
                finalcoe[i]= newcoe[self.active.index(i)] if i in self.active else 0
            # store coeffecient result in a dict            
            self.coeset[j+1]=finalcoe
        return(self.coeset)              
        
mymodel=ForwardSelection(Y=Y,X=X)
mymodel.stepwise()

{0: array([0, 0, 0, 0]),
 1: [0, 0, 0, 0.039289114566804187],
 2: [0, 0.28143477550791052, 0, 0.014361013695887676],
 3: [0, 0.087098869903475748, 0.31256846241576219, 0.0024502078212224887],
 4: [-0.66389073950699551,
  0.075199866755496447,
  0.38124583610926144,
  0.00070786142571611574]}

In [99]:
reg = linear_model.LinearRegression()
start_time = time.time()
reg.fit (X, Y)
print(reg.coef_)
print("--- %s seconds ---" % (time.time() - start_time))

[ 0.07519987  0.38124584  0.00070786]
--- 0.04764533042907715 seconds ---


In [115]:
import pandas as pd
import os
# train = pd.read_csv("C:\\Users\\Wesle\\python_project\\train.csv")
ep=os.path.expanduser('~/python_project/train_sample.csv')
os.path.normpath(ep)
train = pd.read_csv(os.path.normpath(ep))

In [117]:
X=train.filter(regex="weatherVar[0-9]{1,2}$")
X.head()

Unnamed: 0,weatherVar1,weatherVar2,weatherVar3,weatherVar4,weatherVar5,weatherVar6,weatherVar7,weatherVar8,weatherVar9,weatherVar10,...,weatherVar90,weatherVar91,weatherVar92,weatherVar93,weatherVar94,weatherVar95,weatherVar96,weatherVar97,weatherVar98,weatherVar99
0,7.95166,0.0,0.882133,1.635643,0.420159,0.923142,0.663614,9.497982,28.999883,0.567442,...,1.014915,0.150259,0.887304,0.051609,0.288339,0.0,0.0,0.0,1.00085,0.0
1,0.446103,0.0,1.147721,0.009012,1.157393,0.015541,1.187234,0.609595,0.0,1.182801,...,0.022423,1.133088,0.068884,1.390399,1.179081,0.0,1.330108,0.0,0.08996,3.864054
2,0.204195,0.0,0.919459,0.574934,1.246018,1.247934,0.949284,0.0,0.0,0.975163,...,0.537857,1.199227,1.186464,1.065602,0.930882,1.061094,0.430091,1.552867,0.937563,0.0
3,0.361528,0.0,0.983296,0.275168,0.200429,0.146865,0.632942,0.0,0.0,0.18862,...,0.079324,0.23315,0.834495,0.30885,0.264988,0.794054,0.147524,0.0,0.409568,5.60017
4,0.205732,0.0,1.128833,0.390005,1.222026,0.003105,1.200807,0.0,0.0,1.212844,...,0.025181,0.939496,0.125041,1.357594,1.17548,0.0,0.59836,0.0,0.128166,0.0


In [118]:
X=sample.filter(regex="weatherVar[0-9]{1}$").values
Y=sample['target'].values
sampledata=np.insert(X,0,Y,axis=1)
sampledata=sampledata[~np.isnan(sampledata).any(1)]
# X=X[~np.isnan(X).any(1)]
# X.shape
Y=sampledata[:,0]
X=sampledata[:,1:]
X.shape

(4486, 9)

In [103]:
pd.DataFrame(sampledata).to_csv("sampledata.csv")

In [105]:
reg = linear_model.LinearRegression()
start_time = time.time()
reg.fit (sampledata[:,1:], sampledata[:,0])
print(reg.coef_)
print("--- %s seconds ---" % (time.time() - start_time))

[ 0.00051525 -0.0005168   0.02991231 -0.00027858  0.01538864 -0.00103764
  0.01178    -0.00017288 -0.0005513 ]
--- 0.03363490104675293 seconds ---


In [107]:
mymodel=LinearRegression(Y=sampledata[:,0],X=sampledata[:,1:])
mymodel.NormalEquation()

OLS_Coefficients
[-0.04368254  0.00051525 -0.0005168   0.02991231 -0.00027858  0.01538864
 -0.00103764  0.01178    -0.00017288 -0.0005513 ]
OLS_Y_predict
[ 0.0053151   0.03001741  0.00991091 ...,  0.01508422  0.01496253
  0.01732354]
OLS_Residuals
[-0.0053151  -0.03001741 -0.00991091 ..., -0.01508422 -0.01496253
 -0.01732354]
OLS_AIC
1344374.76639


In [110]:
mymodel=ForwardSelection(Y=sampledata[:,0],X=sampledata[:,1:])
mymodel.stepwise()

{0: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 1: [0, 0, 0, 0, 0, 0.011737955661925127, 0, 0, 0, 0],
 2: [0, 0, 0, 0, 0, 0.0078184742284703873, 0, 0.004282513848635738, 0, 0],
 3: [0,
  0,
  0,
  -0.03162173489819077,
  0,
  0.0041368256307888068,
  0,
  0.039466581217143359,
  0,
  0],
 4: [0,
  0,
  0,
  -0.033649020514565503,
  0,
  0.0041301428429678707,
  0,
  0.041445120069362455,
  5.3526928831578642e-05,
  0],
 5: [-0.04916427518150189,
  0,
  0,
  0.027575437726332537,
  0,
  0.012795951992400917,
  0,
  0.020220893689558143,
  -0.00011744839680865284,
  0],
 6: [-0.04846190381639668,
  -0.00015742458274609636,
  0,
  0.027078558297721603,
  0,
  0.012793044779213445,
  0,
  0.02016148172344813,
  -0.00010714395698351666,
  0],
 7: [-0.04453418647290841,
  -0.00011122814773152395,
  0,
  0.023025088717858834,
  -0.00049956962163291566,
  0.012625944482368895,
  0,
  0.020929449221511298,
  -0.0001027676078705982,
  0],
 8: [-0.042653562701110127,
  -0.00026749346052280744,
  0,
 