## Model and Feature Selection

### Notebook Contents:

- [Reading in the Data](#reading2)
- [Defining X and y](#features2)
- [Support Vector Regressor Model](#SVR)

In [None]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer

<a class="anchor" id="reading2"></a>

### Reading in the Data

In [6]:
# read the csv file with log scaled data
df = pd.read_csv('data/log_per_student.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
df.head(2)

Unnamed: 0,leaid,name,stabbr,agchrt,v33,totalrev,tfedrev,c14,c15,c16,...,w01,w31,w61,v95,v02,k14,ce1,ce2,ce3,graduation rate
0,<<<<<<< HEAD,,,,,,,,,,...,,,,,,,,,,
1,2700001,MOUNTAIN IRON-BUHL,MN,3.0,507.0,9.684527,6.770554,5.84403,0.0,3.244194,...,6.04522,0.0,10.269527,5.972549,4.529392,5.201033,0.0,0.0,0.0,0.9355


In [8]:
df.columns = df.columns.str.lower() #make column names lower-case

In [9]:
df.set_index(df['leaid'], inplace = True) #set leaid column as index

In [10]:
dummies = pd.get_dummies(df['agchrt'], drop_first = True).astype('float64') # dummify agchrt column

In [11]:
df = pd.concat([df, dummies], axis = 1) # add to the dataframe

In [12]:
df.isnull().sum() # check for null values

leaid              0
name               3
stabbr             3
agchrt             3
v33                3
                  ..
ce2                3
ce3                3
graduation rate    3
2.0                0
3.0                0
Length: 133, dtype: int64

In [24]:
df.dropna(axis = 0, inplace = True) #drop null values

<a class="anchor" id="features2"></a>

### Define X and y

In [31]:
# numeric X Features (excl stabbr)
X = df[[2, 3,'tfedrev', 'tstrev', 'a13', 't06', 'a11', 'u30', 'totalexp', 't40', 
        'v93', 'z33', 'z35', 'z36', 'z38', 'z37', 'v11', 'v13', 'v17', 'v37', 'v10', 'v12', 'v14', 
        'v18', 'v24', 'v38', 'w01', 'w31', 'w61', '_19h', '_21f', '_41f', '_61v']]

y = df['graduation rate']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y) #train test split the variables

In [33]:
# data types (the only object is district name column)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16848 entries, 3806930 to 2201920
Data columns (total 33 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   2.0       16848 non-null  float64
 1   3.0       16848 non-null  float64
 2   tfedrev   16848 non-null  float64
 3   tstrev    16848 non-null  float64
 4   a13       16848 non-null  float64
 5   t06       16848 non-null  float64
 6   a11       16848 non-null  float64
 7   u30       16848 non-null  float64
 8   totalexp  16848 non-null  float64
 9   t40       16848 non-null  float64
 10  v93       16848 non-null  float64
 11  z33       16848 non-null  float64
 12  z35       16848 non-null  float64
 13  z36       16848 non-null  float64
 14  z38       16848 non-null  float64
 15  z37       16848 non-null  float64
 16  v11       16848 non-null  float64
 17  v13       16848 non-null  float64
 18  v17       16848 non-null  float64
 19  v37       16848 non-null  float64
 20  v10       16848 non-null 

<a class="anchor" id="SVR"></a>

### Pipe: StandardScaler / SVR

In [36]:
# start a pipe
pipe_svr = make_pipeline(StandardScaler(), SVR())

In [37]:
pipe_svr.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()), ('svr', SVR())])

In [38]:
pipe_svr.score(X_train, y_train)

0.5812180307602167

In [39]:
pipe_svr.score(X_test, y_test)

0.34499068602023386

In [40]:
# RMSE
np.round(mean_squared_error(y_test, pipe_svr.predict(X_test), squared = False), 4)

0.0968