# setup and data

In [188]:
import numpy as np

import pandas as pd
from pandas import Series,DataFrame
from bokeh.layouts import gridplot, row, column

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure

from sklearn.datasets import load_boston

from bokeh.models import (BasicTicker, ColumnDataSource, Grid, LinearAxis,
                         DataRange1d, PanTool, Plot, WheelZoomTool)
import sklearn.model_selection

# Enable Bokeh's notebook output for this session. This is sufficient for
# simple visualizations that do not need much interaction.
output_notebook()

In [189]:
# Load the 2018 ranking data and list its columns, one name per line.
df18 = pd.read_csv('../Data_csv/data_2018.csv')
for columnname in df18:
    print(columnname)

# 'stats_pc_intl_students' is stored as text carrying a trailing '%'. Strip the
# sign and convert to a real numeric dtype so later plotting and regression do
# not depend on implicit string-to-float coercion. The .str accessor passes
# missing values through, where a plain str.rstrip call would raise on them.
df18['stats_pc_intl_students'] = pd.to_numeric(
    df18['stats_pc_intl_students'].str.rstrip('%')
)

# Convenience handles on a few individual columns of the raw frame.
rank = df18['rank']
location = df18['location']
percentage_male = df18['percentage_male']
stats_number_students = df18['stats_number_students']
stats_student_staff_ratio = df18['stats_student_staff_ratio']

# Modelling frame: remove the columns listed below from the raw frame in a
# single call. What remains is rank_order, the five scores_*_rank columns and
# the percentage_male / stats_* columns.
dropped_columns = [
    'url',
    'subjects_offered',
    'name',
    'location',
    'male_students',
    'nid',
    'scores_industry_income',
    'scores_international_outlook',
    'scores_research',
    'scores_teaching',
    'scores_citations',
    'Unnamed: 0',
    'rank',
    'scores_overall',
]
df18new = df18.drop(dropped_columns, axis=1)

columnlist = [location, percentage_male, stats_number_students, stats_student_staff_ratio, rank]

Unnamed: 0
location
name
nid
rank
rank_order
scores_citations
scores_citations_rank
scores_industry_income
scores_industry_income_rank
scores_international_outlook
scores_international_outlook_rank
scores_overall
scores_research
scores_research_rank
scores_teaching
scores_teaching_rank
percentage_male
stats_number_students
stats_pc_intl_students
stats_student_staff_ratio
subjects_offered
url
male_students


In [207]:
# Plot every feature of df18new against rank_order, one figure per feature,
# and overlay the univariate least-squares line fitted to that same feature.
# The target column itself is excluded so it is not plotted against itself.
feature_names = [name for name in df18new.columns if name != 'rank_order']
Y = df18new['rank_order']

figures = []
for feature_name in feature_names:
    # Coerce to numeric so a column that holds digit strings is still drawn on
    # a continuous axis and accepted by lstsq.
    x = pd.to_numeric(df18new[feature_name])

    # Design matrix [x, 1]: solving X @ [a, b] ~= Y yields slope a, intercept b.
    X = np.column_stack((x, np.ones(len(x))))
    # rcond=None selects NumPy's current default cutoff explicitly.
    a, b = np.linalg.lstsq(X, Y, rcond=None)[0]

    fig = figure()

    # Scatter of the raw points, plus the fitted line over the same x values.
    fig.scatter(x, Y)
    fig.line(x, a * x + b, color='red')

    # Fix the y-range to 0..1103 so every panel shares the same vertical scale.
    fig.y_range = DataRange1d(start=0, end=1103)

    # Axis information
    fig.xaxis.axis_label = feature_name
    fig.yaxis.axis_label = 'rank'

    figures.append(fig)

show(gridplot(figures, ncols=2, plot_width=400, plot_height=250, toolbar_location=None))

# linear regression

## univariate linear regression

In [191]:
# Univariate least-squares fit of rank_order on scores_citations_rank.
# Target and feature are both read from df18new so they come from one frame.
Y = df18new['rank_order']
x = df18new['scores_citations_rank']

# Design matrix [x, 1]: solving X @ [a, b] ~= Y yields slope a, intercept b.
X = np.column_stack((x, np.ones(len(x))))

# rcond=None selects NumPy's current default cutoff explicitly instead of
# relying on the version-dependent implicit default.
a, b = np.linalg.lstsq(X, Y, rcond=None)[0]

f = figure(plot_width=400, plot_height=250)

# Scatter of the raw points
f.scatter(x, Y)

# Fitted line evaluated at the observed x values
f.line(x, a * x + b, color='red')

# Axis information
f.xaxis.axis_label = "scores_citations_rank"
f.yaxis.axis_label = "overall rank"

show(f)

## computing the error

In [203]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared difference between each raw column of df18 and rank_order,
# collected as (rmse, column name) pairs.
# NOTE(review): this compares the raw column values with rank_order directly,
# not the predictions of a fitted model.
rmsecolumn = []

# The loop variable is deliberately not called `column`, which would rebind the
# `column` layout helper imported from bokeh.layouts.
for column_name in df18:
    try:
        rmse = sqrt(mean_squared_error(np.array(df18[column_name]), df18['rank_order']))
    except ValueError:
        # Columns that mean_squared_error rejects are reported and skipped.
        print(column_name + ' ..whoeps')
    else:
        rmsecolumn.append((rmse, column_name))

# Ascending by RMSE (ties broken by column name); displayed as the cell output.
sorted(rmsecolumn)

location ..whoeps
name ..whoeps
rank ..whoeps
scores_overall ..whoeps
subjects_offered ..whoeps
url ..whoeps
male_students ..whoeps


[(0.0, 'rank_order'),
 (1.0, 'Unnamed: 0'),
 (123.84283608982243, 'scores_citations_rank'),
 (166.87504350578888, 'scores_research_rank'),
 (213.82318722341356, 'scores_teaching_rank'),
 (254.91804809010108, 'scores_international_outlook_rank'),
 (337.8384921549127, 'scores_industry_income_rank'),
 (593.3496620005712, 'percentage_male'),
 (602.3448932514582, 'scores_industry_income'),
 (605.5383674628952, 'scores_international_outlook'),
 (610.0417212446306, 'scores_citations'),
 (617.3422852537194, 'scores_teaching'),
 (620.6703183054474, 'stats_student_staff_ratio'),
 (623.7090928996705, 'scores_research'),
 (630.2054429088863, 'stats_pc_intl_students'),
 (62512.72975110528, 'stats_number_students'),
 (277221.3194803922, 'nid')]

# multivariate regression

In [193]:
# Linear regression estimator from scikit-learn
import sklearn
from sklearn.linear_model import LinearRegression

# Create an unfitted LinearRegression estimator with its default settings
lreg = LinearRegression()

In [194]:
# Separate the modelling frame into the regression target and its predictors.
Y_target = df18new['rank_order']
X_multi = df18new.drop('rank_order', axis=1)

# List every column of the modelling frame, one name per line.
print('\n'.join(df18new.columns))

# Fit the multivariate model on all predictors. fit() hands back the
# estimator, which the notebook shows as this cell's output.
lreg.fit(X_multi, Y_target)

rank_order
scores_citations_rank
scores_industry_income_rank
scores_international_outlook_rank
scores_research_rank
scores_teaching_rank
percentage_male
stats_number_students
stats_pc_intl_students
stats_student_staff_ratio


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [195]:
# The constant term of the fitted model (the counterpart of b in the
# univariate regression).
intercept = lreg.intercept_
print(' The estimated intercept coefficient is {0:.2f}'.format(intercept))

# How many coefficients the fitted model holds.
n_coefficients = len(lreg.coef_)
print(' The number of coefficients used was {0:d}'.format(n_coefficients))

 The estimated intercept coefficient is -71.96
 The number of coefficients used was 9


In [196]:
# Table pairing each predictor column name with its fitted coefficient.
predictor_columns = df18new.drop('rank_order', axis=1).columns
coeff_df = DataFrame(
    {'features': predictor_columns, 'Coefficient Estimate': lreg.coef_},
    columns=['features', 'Coefficient Estimate'],
)

# Show the table with the largest coefficient first.
coeff_df.sort_values(by='Coefficient Estimate', ascending=False)

Unnamed: 0,features,Coefficient Estimate
0,scores_citations_rank,0.547218
7,stats_pc_intl_students,0.430895
3,scores_research_rank,0.254129
4,scores_teaching_rank,0.186745
2,scores_international_outlook_rank,0.146603
1,scores_industry_income_rank,0.008732
6,stats_number_students,-2.4e-05
8,stats_student_staff_ratio,-0.035137
5,percentage_male,-0.219676


In [197]:
# Two-column design matrix for a univariate fit on scores_research_rank:
# the feature values next to a column of ones for the intercept term.
research_rank = df18new['scores_research_rank']
X = np.column_stack((research_rank, np.ones(len(research_rank))))

## training and test set

In [217]:
import sklearn.model_selection

# Split predictors and target into training and test sets. train_test_split
# shuffles the rows, so random_state is pinned to make the split, and every
# metric computed from it, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X_multi, df18new['rank_order'], random_state=42
)

# Shapes of the training and testing feature matrices and targets.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(827, 9) (276, 9) (827,) (276,)


## the prediction

In [218]:
# Start from a fresh estimator and fit it on the training split only.
# fit() returns the estimator itself, so `lreg` is bound to the fitted model;
# the bare name on the last line displays it as the cell output.
lreg = LinearRegression().fit(X_train, Y_train)
lreg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [219]:
# Model predictions for the training rows and for the held-out test rows.
pred_train, pred_test = (lreg.predict(features) for features in (X_train, X_test))

In [220]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the training split.
train_mse = mean_squared_error(Y_train, pred_train)
print("Fit a model X_train, and calculate MSE with Y_train: {0:.2f}".format(train_mse))

# Mean squared error on the held-out test split.
test_mse = mean_squared_error(Y_test, pred_test)
print("Fit a model X_train, and calculate MSE with X_test and Y_test: {0:.2f}".format(test_mse))

Fit a model X_train, and calculate MSE with Y_train: 3696.59
Fit a model X_train, and calculate MSE with X_test and Y_test: 3826.17
