<a href="https://colab.research.google.com/github/susheelkv/ML_models/blob/main/CA_Std_Score_pycaret_ML_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CA Std Score pycaret ML Model

## Objective of this notebook

Here is the code we built in the class:

* https://colab.research.google.com/drive/1baEys_qyF-qrxObKQi8WON1JMjovhVQm?usp=sharing

As practice, build an end-to-end ML model (collecting the dataset; setting the ignored, numerical, categorical, and target features; etc.) and use PyCaret to find the best ML models for the California Schools dataset.

Here is the training data URL for the CA schools:
https://raw.githubusercontent.com/nvamsimohan/DallasDSA/main/CA%20School%20Rankings%20-%20Jan%2022.csv

Once the model is built, predict the standard scores for this client dataset -
https://raw.githubusercontent.com/nvamsimohan/DallasDSA/main/CA%20schools%20-%20Client%20Dataset.csv

Please share the code once done and if you need any assistance with the code, please do not hesitate to reach out to me.

In [None]:
# Install pycaret
!pip install -U --pre pycaret

In [None]:
# Import pandas
import pandas as pd
# Import regression ML modules from pycaret
from pycaret.regression import *

# Load the training dataset (CA school rankings CSV hosted on GitHub) into a DataFrame
url = "https://raw.githubusercontent.com/nvamsimohan/DallasDSA/main/CA%20School%20Rankings%20-%20Jan%2022.csv"
data = pd.read_csv(url)

# Show the loaded DataFrame as this cell's output
data

Unnamed: 0,School,Standard Score,District,City,Zip,County,Is Title I,Is Charter,Is Magnet,Is Virtual,Number Students,Number Fulltime Teachers,Student/Teacher Ratio,Percent Free/Disc Lunch,Percent Homes Rented,Percent parents with less than High School education,Percent Parents with Masters and above education,Percent Home Incomes less than Median income,Percent using Food stamps,Percent of Crimes
0,Riverside STEM Academy,99.9,Riverside Unified,Riverside,92507,Riverside County,No,No,No,No,663,25.0,26.5,0.32,0.07,0.00,0.22,0.31,0.01,0.05
1,Whitney (Gretchen) High,99.7,ABC Unified,Cerritos,90703,Los Angeles County,No,No,No,No,1009,39.2,25.7,0.25,0.02,0.00,0.76,0.15,0.00,0.03
2,William & Marian Ghidotti High,99.7,Nevada Joint Union High,Grass Valley,95945,Nevada County,No,No,No,No,169,6.1,27.3,0.28,0.01,0.01,0.05,0.07,0.00,0.05
3,Oxford Academy,99.5,Anaheim Union High,Cypress,90630,Orange County,No,No,No,No,1250,43.0,29.0,0.36,0.01,0.00,0.70,0.19,0.00,0.02
4,Lynbrook High,99.4,Fremont Union High,San Jose,95129,Santa Clara County,(n/a),No,No,No,1880,80.0,23.4,0.06,0.00,0.00,0.84,0.04,0.00,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2108,Juvenile Hall (Endeavor/Voyager Secondary),8.3,Madera County Superintendent of Schools,Madera,93638,Madera County,Yes,No,No,No,25,4.0,6.2,0.88,0.04,0.08,0.00,0.84,0.00,0.00
2109,Captain John Continuation High,8.2,Klamath-Trinity Joint Unified,Hoopa,95546,Humboldt County,Yes,No,No,No,33,2.0,16.5,0.82,0.00,0.91,0.00,0.00,0.00,0.00
2110,Thomas E. Mathews Community,8.1,Yuba County Office of Education,Marysville,95901,Yuba County,(n/a),No,No,No,39,2.4,15.7,0.85,0.10,0.00,0.00,0.51,0.00,0.00
2111,Wellington M. Smith Junior,7.9,Monterey County Office of Education,Salinas,93906,Monterey County,Yes,No,No,No,52,7.0,7.3,0.94,0.00,0.00,0.04,0.90,0.00,0.00


In [None]:
# Display the columns
data.columns

Index(['School', 'Standard Score', 'District', 'City', 'Zip', 'County',
       'Is Title I', 'Is Charter', 'Is Magnet', 'Is Virtual',
       'Number Students', 'Number Fulltime Teachers', 'Student/Teacher Ratio',
       'Percent Free/Disc Lunch', 'Percent Homes Rented',
       'Percent parents with less than High School education',
       'Percent Parents with Masters and above education',
       'Percent Home Incomes less than Median income',
       'Percent using Food stamps', 'Percent of Crimes'],
      dtype='object')

In [None]:
# Name of the target column, i.e. the value the regression model is trained to predict
y = "Standard Score"

In [None]:
# Columns excluded from modeling (handed to setup() as ignore_features).
# Selection criteria used by the author: noise / redundant / feeder variables
# and variables with more than 20% missing data.
ignored_cols = [
    "School",
    "Percent using Food stamps",
    "Percent of Crimes",
]

In [None]:
# Columns that setup() should treat as categorical features
cat_cols = [
    "District",
    "City",
    "Zip",
    "County",
    "Is Title I",
    "Is Charter",
    "Is Magnet",
    "Is Virtual",
]

In [None]:
# Columns that setup() should treat as numeric features
num_cols = [
    "Number Students",
    "Number Fulltime Teachers",
    "Student/Teacher Ratio",
    "Percent Free/Disc Lunch",
    "Percent Homes Rented",
    "Percent Parents with Masters and above education",
    "Percent parents with less than High School education",
    "Percent Home Incomes less than Median income",
]

In [None]:
# Configure the PyCaret regression experiment:
#   target               - column to predict (y)
#   categorical_features - columns to treat as categorical (cat_cols)
#   numeric_features     - columns to treat as numeric (num_cols)
#   ignore_features      - columns left out of modeling (ignored_cols)
#   session_id           - fixed random seed so the train/test split and the
#                          model metrics are reproducible across runs; 7649 is
#                          the session id reported by the recorded setup output
regression_setup = setup(data,
                         target=y,
                         categorical_features=cat_cols,
                         numeric_features=num_cols,
                         ignore_features=ignored_cols,
                         session_id=7649)

Unnamed: 0,Description,Value
0,Session id,7649
1,Target,Standard Score
2,Target type,Regression
3,Original data shape,"(2113, 20)"
4,Transformed data shape,"(2113, 19)"
5,Transformed train set shape,"(1479, 19)"
6,Transformed test set shape,"(634, 19)"
7,Ignore features,3
8,Numeric features,8
9,Categorical features,8


In [None]:
# Train and score the candidate regression algorithms on the configured experiment
# and display the comparison grid of their metrics (MAE, MSE, RMSE, R2, RMSLE, MAPE)
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,15.7859,405.8015,20.1255,0.4554,0.5314,0.548,0.555
lightgbm,Light Gradient Boosting Machine,15.4315,419.5229,20.4387,0.4372,0.5315,0.5367,0.405
ada,AdaBoost Regressor,16.9602,431.039,20.747,0.4212,0.5711,0.65,0.351
xgboost,Extreme Gradient Boosting,16.1564,459.5951,21.3725,0.3826,0.5509,0.5609,0.594
et,Extra Trees Regressor,16.4359,468.6063,21.6223,0.3713,0.5577,0.5866,0.985
lar,Least Angle Regression,18.2398,499.6072,22.3275,0.3298,0.6143,0.6691,0.178
lr,Linear Regression,18.2874,502.0194,22.3804,0.3265,0.6149,0.6711,1.451
ridge,Ridge Regression,18.4287,509.6112,22.5494,0.3163,0.6162,0.6765,0.265
br,Bayesian Ridge,18.5442,515.9097,22.6886,0.3079,0.6175,0.6807,0.185
rf,Random Forest Regressor,17.3482,520.0944,22.7567,0.3021,0.5731,0.6122,1.352


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [None]:
# Train the top-rated model from the comparison grid ("gbr", the Gradient Boosting Regressor).
# create_model evaluates it with cross-validation; the output lists one row of metrics per fold (10 folds, numbered 0-9).
# The number of folds is configurable through the `fold` parameter of setup().
# Note: the MAE, MSE and RMSE values here should line up with the gbr row of the comparison grid.
model = create_model("gbr")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,17.4476,477.2754,21.8466,0.4123,0.5611,0.5746
1,15.3449,366.0451,19.1323,0.4853,0.543,0.5955
2,15.6013,374.6707,19.3564,0.5232,0.533,0.5981
3,16.0566,403.2133,20.0802,0.3896,0.5058,0.5101
4,15.6662,394.8547,19.871,0.4751,0.5213,0.4986
5,14.5301,359.5448,18.9617,0.5234,0.5143,0.5361
6,15.9876,445.817,21.1144,0.37,0.521,0.5199
7,16.2542,430.0913,20.7386,0.4079,0.5375,0.5458
8,15.5594,420.1308,20.4971,0.4768,0.5279,0.5083
9,15.4111,386.3719,19.6563,0.4903,0.5495,0.5928


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Load the client dataset for which the standard scores have to be predicted
client_url = "https://raw.githubusercontent.com/nvamsimohan/DallasDSA/main/CA%20High%20Schools%20-%20Client%20Data.csv"
# Alternative (disabled): read a local copy of the client file instead of fetching it from GitHub
#client_file = "/content/CA schools - Client Dataset.csv"
#client_data = pd.read_csv(client_file)
client_data = pd.read_csv(client_url)
# Show the client columns so they can be compared with the training data's columns
client_data.columns

Index(['School', 'District', 'City', 'Zip', 'County', 'Is Title I',
       'Is Charter', 'Is Magnet', 'Is Virtual', 'Number Students',
       'Number Fulltime Teachers', 'Student/Teacher Ratio',
       'Percent Free/Disc Lunch', 'Percent Homes Rented',
       'Percent parents with less than High School education',
       'Percent Parents with Masters and above education',
       'Percent Home Incomes less than Median income',
       'Percent using Food stamps', 'Percent of Crimes',
       'Percent of Crimes.1'],
      dtype='object')

In [None]:
# Score the client schools with the trained model and keep the resulting DataFrame
client_data_predictions = predict_model(estimator=model, data=client_data)

In [None]:
# Write the predictions DataFrame to a CSV file (relative path, so it lands in the current working directory).
# Note: to_csv writes the row index as the first column by default (index=True).
client_data_predictions.to_csv("CA High School Scores.csv")

In [None]:
# Display the predicted scores, i.e. the 'prediction_label' column of the predictions DataFrame
client_data_predictions['prediction_label']

0      50.160241
1      99.354988
2      74.184286
3      76.469332
4      71.839114
         ...    
472    12.567850
473    11.890247
474    10.086017
475    11.531273
476    18.244985
Name: prediction_label, Length: 477, dtype: float64