In [1]:
#Loading csv with processed text and csv with feature_vecs (100 word embedding vectors + dummy variables for essay set)
import pandas as pd
import numpy as np

# Paths use forward slashes: the previous backslash literals relied on invalid
# escape sequences ('\i', '\p') that newer Python versions warn about, and only
# worked on Windows. Forward slashes resolve to the same files on Windows and
# also work on other platforms.
# df: processed essays; the first CSV column is used as the row index.
df = pd.read_csv('data/interim/processed_text.csv', index_col=0)
# X: feature vectors, one row per essay; the first CSV column is used as the row index.
X = pd.read_csv('data/interim/feature_vecs.csv', index_col=0)

In [2]:
# Rescale every feature column with a MinMaxScaler (default settings).
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the scaler is fitted on the full feature matrix, i.e. before the
# train/test split performed in a later cell, so test rows contribute to the
# fitted min/max. Confirm this is acceptable for the evaluation.
scaler_x = MinMaxScaler().fit(X)
# X is rebound from the loaded DataFrame to the array returned by transform().
X = scaler_x.transform(X)

In [3]:
import os

# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone package and fall back only for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

#Scaling essay sets individually: each set gets its own MinMaxScaler fitted on
#that set's observed domain1_score values, and the fitted scaler is saved so the
#transformation can be reversed later.
# Forward slashes resolve to the same files as the old backslash form on Windows
# and avoid the invalid '\s' escape sequence.
SCALER_PATH = 'src/scalers/scaler{}.pkl'
os.makedirs(os.path.dirname(SCALER_PATH), exist_ok=True)

# Scaled scores are written back by row position so that they stay aligned with
# df's row order. (Concatenating per-set results in set order is only correct
# when df happens to be sorted by essay_set.)
y_scaled = np.full(len(df), np.nan)
covered = np.zeros(len(df), dtype=bool)

for set_id in range(1,9):
    in_set = (df['essay_set'] == set_id).values
    if not in_set.any():
        # Nothing to fit for this set; MinMaxScaler would raise on an empty array.
        continue

    # A fresh scaler per set keeps each saved file independent of the others.
    scaler_y = MinMaxScaler()
    essay_scores = df.loc[in_set, 'domain1_score'].values.reshape(-1,1)
    y_scaled[in_set] = scaler_y.fit_transform(essay_scores).ravel()
    covered |= in_set

    scaler_filename = SCALER_PATH.format(set_id)
    joblib.dump(scaler_y, scaler_filename)

if not covered.all():
    # Rows outside sets 1-8 would otherwise be left with a NaN target.
    raise ValueError('essay_set values outside 1-8 found; their scores cannot be scaled')

essay_set = df.essay_set.values

# y gets a fresh positional index (0..n-1), not df's index.
y = pd.DataFrame({'essay_set':essay_set, 'scaled':y_scaled})

In [4]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. X and y are split together so their
# rows stay paired; the fixed seed makes the split repeatable.
SPLIT_SEED = 728
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=SPLIT_SEED,
)

In [None]:
#Training gradientboostingregressor: grid search over the number of trees and
#the learning rate, with tree depth and min_samples_split held fixed.
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

# The loss is left at the estimator's default (least squares). The previous
# single-valued entry 'loss': ['ls'] is rejected by newer scikit-learn releases
# and did not add any candidates to the search.
params = {
    'n_estimators': [50, 100, 500, 1000],
    'max_depth': [2],
    'min_samples_split': [2],
    'learning_rate': [1, 0.1, 0.3, 0.01],
}

gbr = ensemble.GradientBoostingRegressor()
# 3-fold cross-validation on the training split only; the target is the
# per-essay-set scaled score column.
grid = GridSearchCV(gbr, params, cv=3)
grid.fit(X_train, y_train.scaled)

# Predictions of the refitted best estimator on the held-out split
# (not used by the other cells visible in this notebook).
y_pred = grid.predict(X_test)

# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_)

In [7]:
# Final model using the tuned hyperparameters. Only the tuned values are passed;
# everything else uses the library defaults, which equal the values that were
# previously spelled out. In particular `presort`, `min_impurity_split` and
# `loss='ls'` are no longer passed: newer scikit-learn releases reject them, and
# the default loss is still least squares.
# NOTE(review): random_state is left unset (as before), so repeated fits are not
# guaranteed to be bit-for-bit identical.
model = ensemble.GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=2,
    min_samples_split=2,
    n_estimators=1000,
)

In [8]:
# Fit on the training split; the target is the per-essay-set scaled score column.
# The fitted estimator is the cell's last expression, so its repr is displayed.
model.fit(X_train, y_train['scaled'])

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=1000, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [9]:
#Scaled original and predicted values
y_pred_scaled = model.predict(X_test)
y_test_scaled = y_test.scaled.values
# Plain array of set ids, so the masks below index the numpy arrays by position
# rather than through a pandas Series carrying y_test's shuffled index.
test_essay_sets = y_test.essay_set.values

#Reversing transformation for original and predicted based on the essay_set
y_test_unscaled = np.zeros(len(y_test_scaled))
y_pred_unscaled = np.zeros(len(y_pred_scaled))

for set_id in range(1,9):
    mask = (test_essay_sets == set_id)
    if not mask.any():
        # No test rows for this set; inverse_transform would raise on an empty array.
        continue

    # Forward slashes resolve to the same file as the old backslash form on
    # Windows and avoid the invalid '\s' escape sequence.
    scaler_filename = 'src/scalers/scaler{}.pkl'.format(set_id)
    # joblib.load unpickles the file: only load scaler files written by this notebook.
    scaler = joblib.load(scaler_filename)

    # The scaler works on 2-D column vectors; ravel() returns the 1-D values.
    y_test_unscaled[mask] = scaler.inverse_transform(y_test_scaled[mask].reshape(-1,1)).ravel()
    y_pred_unscaled[mask] = scaler.inverse_transform(y_pred_scaled[mask].reshape(-1,1)).ravel()

#Creating dataframe, updating columns, and saving to interim data folder for later exploration
# Both *_unscaled arrays are already 1-D, so no reshaping is needed here.
predictions = pd.DataFrame({'essay_set': y_test.essay_set, 'pred_scaled': y_pred_scaled,
                            'pred_unscaled': y_pred_unscaled,'orig_scaled': y_test_scaled,
                            'orig_unscaled': y_test_unscaled}, index=y_test.index)

# Forward slashes: same Windows location as before, without the invalid
# '\i' / '\e' escape sequences.
predictions.to_csv('data/interim/essay_dependent_scaler_results.csv')

In [10]:
# Preview only the first rows instead of rendering the whole test-set frame.
predictions.head(10)

Unnamed: 0,essay_set,pred_scaled,pred_unscaled,orig_scaled,orig_unscaled
6970,4,0.922241,2.766722,1.000000,3.0
6711,4,0.434321,1.302963,0.333333,1.0
9297,6,0.935175,3.740700,0.750000,3.0
4826,3,0.694638,2.083914,0.666667,2.0
10903,7,0.640636,16.093996,0.545455,14.0
76,1,0.762088,9.620884,0.800000,10.0
12868,8,0.737763,46.888155,0.400000,30.0
2342,2,0.531853,3.659263,0.600000,4.0
5842,4,0.821883,2.465650,1.000000,3.0
10116,6,0.481825,1.927299,0.500000,2.0
