In [47]:
import sys
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("gradeTrain.csv")
test_data = pd.read_csv("gradeTest.csv")

### first try with a smaller subset (data from 2018)

In [3]:
def preprocessing(data):
    nan_loc = np.argwhere(np.isnan(data))
    big_loc = np.argwhere(data>100)
    subzero_loc = np.argwhere(data<0)
    #we have: nan_loc, big_loc, subzero_loc
    for i in np.vstack((nan_loc, big_loc, subzero_loc)):
        data[i[0],i[1]] = np.mean(np.delete(data[i[0]], [i[0],i[1]]))
    return data

In [4]:
data18 = data[data.Year == 2018]
data18 = data18.drop(columns = "Year")
train18, val18 = train_test_split(data18, test_size=0.1)
y_train18 = np.array(train18["FinalExam"])   #target 
x_train18 = np.array(train18.drop(columns = "FinalExam"))  #data 

In [5]:
y_val18 = np.array(val18['FinalExam'])
x_val18 = np.array(val18.drop(columns = "FinalExam"))

In [6]:
x_train18 = preprocessing(x_train18)
x_val18 = preprocessing(x_val18)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  import sys


In [7]:
#scale = StandardScaler()
#x_train18 = scale.fit_transform(x_train18)
#x_val18 = scale.fit_transform(x_val18)

In [14]:
linreg = LinearRegression(normalize=True)

In [15]:
linreg.fit(x_train18, y_train18)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [16]:
print('The estimated intercept coefficient is %.2f' %linreg.intercept_)
print('The number of coefficients used was %d' %len(linreg.coef_))

The estimated intercept coefficient is 34.93
The number of coefficients used was 11


In [17]:
coeff_df = pd.DataFrame(train18.columns)
coeff_df.columns = ['Features']
coeff_df['Coefficient Estimate'] = pd.Series(linreg.coef_)
coeff_df

Unnamed: 0,Features,Coefficient Estimate
0,0,-0.009098
1,1,-0.073012
2,2,-0.307661
3,3,0.892914
4,4,-0.345273
5,5,-0.137702
6,6,-0.313013
7,7,0.964396
8,8,-0.348924
9,9,-0.010576


In [18]:
def upperbounding(result):
    for i in np.argwhere(result> 100):
        result[i[0]] = 100
    return result 

In [19]:
pred_train18 = linreg.predict(x_train18)
pred_train18 = upperbounding(pred_train18)
pred_val18 = linreg.predict(x_val18)
pred_val18 = upperbounding(pred_val18)
print ('Fit a model with X_train, and calculate the MSE with Y_train: %.2f' % np.mean((y_train18 - pred_train18)**2))
print ('Fit a model with X_train, and calculate the MSE with X_val and Y_val: %.2f' % np.mean((y_val18 - pred_val18)**2))

Fit a model with X_train, and calculate the MSE with Y_train: 3.68
Fit a model with X_train, and calculate the MSE with X_val and Y_val: 3.30


### We now try with ALL of our data, validate with validation, and then make prediction on test set

In [37]:
train, val = train_test_split(data, test_size=0.1)

In [38]:
y_train = np.array(train.FinalExam)
y_val = np.array(val.FinalExam)

In [39]:
x_train = np.array(train.drop(columns=["FinalExam", "Year"]))
x_train = preprocessing(x_train)
x_val = np.array(val.drop(columns=["FinalExam", "Year"]))
x_val = preprocessing(x_val)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  import sys


In [40]:
# Check dimension
print("train data:", x_train.shape)
print("train target", y_train.shape)
print("validation data", x_val.shape)
print("validation target", y_val.shape)

train data: (81000, 11)
train target (81000,)
validation data (9000, 11)
validation target (9000,)


In [41]:
linreg = LinearRegression(normalize = True)
linreg.fit(x_train, y_train)

print('The estimated intercept coefficient is %.3f' %linreg.intercept_)
print('The number of coefficients used was %d' %len(linreg.coef_))

coeff_df = pd.DataFrame(train.columns)
coeff_df.columns = ['Features']
coeff_df['Coefficient Estimate'] = pd.Series(linreg.coef_)
print(coeff_df)

pred_train = upperbounding(linreg.predict(x_train))

pred_val = upperbounding(linreg.predict(x_val))

print ('Fit a model with X_train, and calculate the MSE with Y_train: %.3f' % np.mean((y_train - pred_train)**2))
print ('Fit a model with X_train, and calculate the MSE with X_val and Y_val: %.3f' % np.mean((y_val - pred_val)**2))

The estimated intercept coefficient is 42.577
The number of coefficients used was 11
     Features  Coefficient Estimate
0           0             -0.010791
1           1             -0.025233
2           2             -0.018326
3           3              0.251094
4           4             -0.062614
5           5             -0.047412
6           6             -0.036276
7           7              0.284831
8           8             -0.005780
9           9              0.038815
10         10              0.128760
11  FinalExam                   NaN
12       Year                   NaN
Fit a model with X_train, and calculate the MSE with Y_train: 60.630
Fit a model with X_train, and calculate the MSE with X_val and Y_val: 59.011


In [None]:
#import matplotlib.pyplot as plt
#import seaborn as sns
#sns.set_style('whitegrid')

#train = plt.scatter(pred_train, (pred_train - y_train), c='b', alpha=0.25)
#test = plt.scatter(pred_val, (pred_val - y_val), c='r', alpha=0.3)
#plt.hlines(y=0, xmin=65, xmax=100)
#plt.legend((train, test), ('Training', 'Test'), loc='lower left')
#plt.title('Residual Plots')

### Something does not make sense if you compare the results of 2018 with results from doing all data

In [25]:
pd.value_counts(data.Year)

2018    10000
2017    10000
2016    10000
2015    10000
2014    10000
2013    10000
2012    10000
2011    10000
2010    10000
Name: Year, dtype: int64

From above, we see that the number of observations for each year is the same, 10000
Let's run LR on each year. 

In [67]:
# LR pipeline:
def LR(data):
    train, val = train_test_split(data, test_size=0.1)
    y_train = np.array(train.FinalExam)
    y_val = np.array(val.FinalExam)
    x_train = np.array(train.drop(columns=["FinalExam", "Year"]))
    x_train = preprocessing(x_train)
    x_val = np.array(val.drop(columns=["FinalExam", "Year"]))
    x_val = preprocessing(x_val)
    
    linreg = LinearRegression(normalize=True)
    
    linreg.fit(x_train, y_train)
    
    #print('The estimated intercept coefficient is %.3f' %linreg.intercept_)
    #print('The number of coefficients used was %d' %len(linreg.coef_))

    pred_train = upperbounding(linreg.predict(x_train))
    pred_val = upperbounding(linreg.predict(x_val))
    
    train_MSE = np.mean((y_train - pred_train)**2)
    val_MSE = np.mean((y_val - pred_val)**2)
                        
    #print ('Fit a model with X_train, and calculate the MSE with Y_train: %.3f' % np.mean((y_train - pred_train)**2))
    #print ('Fit a model with X_train, and calculate the MSE with X_val and Y_val: %.3f' % np.mean((y_val - pred_val)**2))
    return {"intercept":linreg.intercept_, "coef":linreg.coef_, "train MSE": train_MSE, "val MSE":val_MSE}

In [68]:
yearly_results = {}
for year in pd.unique(data.Year):
    yearly_results[year] = LR(data[data.Year == year])

In [69]:
yearly_results

{2018: {'intercept': 34.96152486902821,
  'coef': array([-0.00828236, -0.07504017, -0.30609812,  0.89390769, -0.34770126,
         -0.13928251, -0.31075615,  0.96542066, -0.35032914, -0.0115977 ,
          0.27802369]),
  'train MSE': 3.670935175216663,
  'val MSE': 3.3098068434736363},
 2017: {'intercept': 37.49561252073815,
  'coef': array([-0.01243567, -0.06561123, -0.24446323,  0.74957975, -0.28744198,
         -0.12016764, -0.2483422 ,  0.82483677, -0.2715217 , -0.00792049,
          0.24719219]),
  'train MSE': 14.430890333794524,
  'val MSE': 14.51008799668563},
 2016: {'intercept': 38.57331722123642,
  'coef': array([-0.02956732, -0.04496149, -0.14180963,  0.54389925, -0.18239301,
         -0.11705057, -0.16746402,  0.59767475, -0.15263606,  0.02271465,
          0.20814706]),
  'train MSE': 38.46458094686716,
  'val MSE': 37.480891613283546},
 2015: {'intercept': 42.4961575588792,
  'coef': array([-0.0285657 , -0.03131612, -0.10019574,  0.38055549, -0.11991152,
         -0.074

In [70]:
pd.DataFrame(yearly_results).transpose()

Unnamed: 0,intercept,coef,train MSE,val MSE
2018,34.9615,"[-0.00828236336808925, -0.07504016732441884, -...",3.67094,3.30981
2017,37.4956,"[-0.012435673189532925, -0.06561123238504982, ...",14.4309,14.5101
2016,38.5733,"[-0.02956732176982512, -0.0449614862639975, -0...",38.4646,37.4809
2015,42.4962,"[-0.02856569731701113, -0.03131612395045009, -...",52.4413,52.4612
2014,45.7538,"[-0.020985538967690492, -0.048275582726407036,...",58.4676,58.0351
2013,49.5812,"[-0.010008014552846535, -0.019416064426393938,...",65.746,63.709
2012,51.6578,"[-0.00913155247301439, -0.0047382762137135916,...",65.0263,69.2445
2011,65.319,"[0.009434736198104634, 0.0016314797860218166, ...",65.3809,69.1047
2010,78.7327,"[0.008053843522324602, 0.011692985889231152, 0...",76.7859,77.7022


### Now, we make prediction on the Test set

In [66]:
test_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,Year
0,82.4,83.2,94.7,97.8,95.0,91.6,73.4,72.7,75.0,76.7,72.1,2019
1,81.4,76.1,80.0,87.2,81.3,73.9,74.3,71.4,82.1,84.0,88.2,2019
2,78.6,70.9,81.6,70.0,74.1,72.1,80.5,84.3,88.6,90.5,87.3,2019
3,100.0,94.3,90.5,94.0,94.9,93.6,93.9,79.7,77.9,81.9,87.3,2019
4,69.0,69.8,72.1,73.2,70.0,79.8,82.1,85.3,80.6,78.5,82.3,2019
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,86.3,85.6,81.3,76.3,80.5,68.9,64.3,60.8,71.5,71.3,74.6,2019
9996,87.5,85.4,86.9,85.5,68.4,73.9,83.8,86.3,79.4,69.9,70.1,2019
9997,89.5,86.5,89.7,80.2,77.6,83.0,92.7,83.9,85.8,85.4,76.2,2019
9998,81.8,91.1,85.4,78.3,87.0,83.8,89.0,88.4,83.1,84.3,79.5,2019
