# Classification on College Scorecard Data

In [1]:
import pickle
from sklearn.linear_model    import SGDClassifier
from sklearn.linear_model    import LogisticRegression
from sklearn.metrics         import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
import numpy as np

## Import the data

In [2]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as loading finishes, even if unpickling raises.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that come
    # from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Features and labels used for fitting below, plus their `*_test` counterparts.
college = _load_pickle("college.p")
college_label = _load_pickle("college_label.p")
college_test = _load_pickle("college_test.p")
college_test_label = _load_pickle("college_test_label.p")

## Encode earnings as a binary categorical variable

In [4]:
# 50th percentile (median) of the earnings labels; used below as the cut-off
# between the two classes.
quantiles = college_label.quantile(0.5)

In [5]:
quantiles  # median earnings; the threshold median_code compares each value against

30600.0

In [6]:
def median_code(data, threshold=None):
    """Encode a single earnings value as a binary class label.

    Parameters
    ----------
    data : scalar
        The value to encode.
    threshold : scalar, optional
        Cut-off to compare against. When omitted, the notebook-level
        ``quantiles`` value (the median earnings) is used, so existing calls
        such as ``series.map(median_code)`` behave exactly as before.

    Returns
    -------
    int
        ``0`` when ``data`` is strictly below the threshold, ``1`` otherwise.

    Notes
    -----
    A NaN input fails the ``<`` comparison and is therefore encoded as ``1``.
    """
    if threshold is None:
        # Resolved at call time so the current notebook-level median is used.
        threshold = quantiles
    return 0 if data < threshold else 1

In [26]:
# Replace the training earnings with their 0/1 codes.
# NOTE: this overwrites `college_label`, so the cell is not idempotent --
# running it a second time would re-encode the 0/1 codes rather than earnings.
college_label = college_label.map(median_code)

In [7]:
# Encode the test earnings with the same `quantiles` threshold that
# median_code uses for the training labels.
# NOTE: this overwrites `college_test_label`, so the cell is not idempotent.
college_test_label = college_test_label.map(median_code)

# Create a LogisticRegression Model

In [8]:
# Logistic-regression classifier using the liblinear solver, with a fixed
# seed passed as random_state.
log_clf = LogisticRegression(solver="liblinear", random_state=42)

In [9]:
# Fit on the full training set. scikit-learn's `fit` returns the estimator
# itself, so `train_fit` and `log_clf` refer to the same fitted model.
train_fit = log_clf.fit(college, college_label)

In [31]:
# Predict on the same data the model was fitted on (training-set predictions).
train_predict = train_fit.predict(college)

## Model Evaluation

In [71]:
# Stratified 3-fold cross-validation of the logistic-regression model,
# printing the accuracy obtained on each held-out fold.
#
# `random_state` is deliberately not passed: without `shuffle=True` it has no
# effect, and recent scikit-learn releases raise a ValueError for that
# combination. Use `StratifiedKFold(n_splits=3, shuffle=True, random_state=42)`
# instead if the rows should be shuffled before splitting.
skfolds = StratifiedKFold(n_splits=3)

# Convert the labels once so they can be indexed positionally inside the loop.
label_array = np.array(college_label)

for train_index, test_index in skfolds.split(college, college_label):
    # Fresh, unfitted copy with the same hyper-parameters for every fold.
    clone_clf = clone(log_clf)
    # NOTE(review): assumes `college` supports positional row indexing
    # (e.g. a NumPy array) -- confirm.
    X_train_folds = college[train_index]
    y_train_folds = label_array[train_index]
    X_test_fold = college[test_index]
    y_test_fold = label_array[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Vectorised count of correct predictions on the held-out fold.
    n_correct = np.sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

0.782907049283
0.842072409488
0.73720349563


In [72]:
# Quick look at the encoded labels as a NumPy array.
np.array(college_label)

array([0, 1, 1, ..., 1, 1, 1])

In [63]:
# NOTE(review): the recorded output below (a float Series containing NaN)
# appears to predate the current cross-validation loop, in which
# `y_train_folds` is a NumPy array -- re-run or remove this cell.
y_train_folds

1440    0.0
1442    1.0
1443    0.0
1444    NaN
1447    0.0
1448    1.0
1450    NaN
1457    NaN
1458    NaN
1459    NaN
1460    0.0
1464    NaN
1465    0.0
1467    NaN
1468    0.0
1469    NaN
1470    0.0
1471    1.0
1472    1.0
1474    NaN
1476    1.0
1477    1.0
1479    0.0
1482    1.0
1483    1.0
1484    NaN
1487    1.0
1491    NaN
1493    0.0
1495    0.0
       ... 
4777    1.0
4778    0.0
4779    0.0
4780    0.0
4781    1.0
4782    NaN
4783    1.0
4784    NaN
4785    0.0
4786    NaN
4787    0.0
4788    0.0
4789    0.0
4790    NaN
4791    0.0
4792    1.0
4793    0.0
4794    0.0
4795    NaN
4796    NaN
4797    0.0
4798    NaN
4799    NaN
4800    0.0
4801    0.0
4802    NaN
4803    0.0
4804    1.0
4805    0.0
4806    0.0
Name: MDEarning10, dtype: float64

In [42]:
# Confusion matrix for the training-set predictions: rows are the true
# classes, columns are the predicted classes.
confusion_matrix(y_true=college_label, y_pred=train_predict)

array([[2120,  283],
       [ 460, 1944]])