In [None]:
"""
Taylor Richardson
December 11, 2018.

Logistical regression is a machine learning classification algorithm that predicts the probability of a categorical dependent variable

I scaled the data to normalize the features, setting the mean to 0 and a standard deviation of 1, and used a test size of 20%.


The outcome variable denoted a 0 if the student didn't perform better on the next test than on a previous test, and 1 if they did perform better than previously.


This indicates that there are two correlations with outcome, both method and prvperf. 

The model itself is somewhat accurate with a f1 score of 59%. Based on this, we can say that the model isn’t very effective at predicting outcome, but could be useful for forecasting

"""

In [1]:
#Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Load dataset
studentPerf = pd.read_csv("./studentperf.csv")

In [3]:
#Create table of mean values
table1 = np.mean(studentPerf, axis = 0)
table1

Method      2.000000
satis       4.966667
time       18.533333
ability    29.216667
prvperf    22.796667
Outcome     0.450000
dtype: float64

In [4]:
#Define x and y variables
x = studentPerf.iloc[:,0:5].values
y = studentPerf.iloc[:,5].values

In [5]:
#Load library for training
from sklearn.model_selection import train_test_split

In [6]:
#Create training and test datasets using 20% split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=100)

In [7]:
#Scale the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

In [8]:
#Create the logistic regression model
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression()
logit.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
#Create coefficient table
coeff_table = pd.DataFrame(np.transpose(logit.coef_), studentPerf.iloc[:,0:5].columns, columns = ['Coefficient'])
coeff_table

Unnamed: 0,Coefficient
Method,-0.493212
satis,-0.156839
time,-0.05926
ability,-0.226913
prvperf,0.463816


In [10]:
logit.intercept_

array([-0.06754473])

In [11]:
logit.score(x_test, y_test)

0.5833333333333334

In [12]:
#Predictions
y_pred = logit.predict(x_test)
y_pred

array([1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1], dtype=int64)

In [13]:
#Forecast table
dftable = pd.DataFrame({'Actual': y_test, 'Predicted':y_pred})
dftable

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,0,0
3,0,1
4,1,0
5,0,1
6,0,1
7,0,0
8,0,0
9,0,0


In [14]:
#Evaluate the algorithum
from sklearn.metrics import classification_report, confusion_matrix
target_names = ['Negative Change', 'Positive Change']
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test, y_pred, target_names = target_names))

[[5 3]
 [2 2]]
                 precision    recall  f1-score   support

Negative Change       0.71      0.62      0.67         8
Positive Change       0.40      0.50      0.44         4

    avg / total       0.61      0.58      0.59        12

