# Final Project
## Authors:
- Taylor Tucker
- Virginia Weston
- Tina Jin
- Jeffrey Bradley

## Code for decision trees (classification and regression) and random forest.

Import statements

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

Importing the dataset

In [3]:
# Load the cleaned dataset. The path is relative, so the notebook must be run
# from the directory that contains cleaned_data.csv.
df = pd.read_csv("./cleaned_data.csv")

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,Number of Bachelor's Degrees,Percent Financial Aid,Average Amount of Aid,Retention Rate,Enrollment,Percent Women,Percent In State,Percent Out of State,Percent Foreign,...,Graduation Rate,Percent Awarded,Total Staff,Instructional Staff,SA Staff,Librarian Staff,Percent Books,Percent Digital,Percent Admitted,Total Price
0,0,208.0,100.0,32400.0,79.0,996,99.0,59.0,36.0,4.0,...,69.0,66.0,357.0,105.0,56.0,62.0,41,12,70.0,55625.0
1,1,310.0,100.0,40855.0,75.0,1533,54.0,66.0,32.0,1.0,...,64.0,61.0,435.0,132.0,21.0,27.0,37,54,68.0,59470.0
2,2,398.0,100.0,39796.0,68.0,1912,60.0,53.0,46.0,1.0,...,51.0,48.0,355.0,123.0,17.0,21.0,28,13,62.0,60636.0
3,3,382.0,100.0,38689.0,82.0,1771,56.0,50.0,45.0,4.0,...,74.0,70.0,426.0,160.0,41.0,50.0,27,46,64.0,63180.0
4,4,61.0,97.0,10055.0,37.0,698,45.0,64.0,34.0,0.0,...,31.0,10.0,115.0,41.0,4.0,7.0,20,76,64.0,23170.0


I cannot use a classifier to predict a continuous target variable. Therefore, for the classifier models, I will need to bin
the target into discrete classes. I will do this by creating one class per $10,000 interval, i.e.
0-10,000, 10,000-20,000, 20,000-30,000, etc.

In [5]:
# Report the observed price range so the class boundaries can be chosen.
# Series.max()/min() are vectorized and skip NaN, whereas the builtin
# max()/min() iterate in Python and give order-dependent results if a NaN
# is present.
print(df["Total Price"].max())
print(df["Total Price"].min())

76947.0
16700.0


We can see from above that the max price of a school is 76,947 and the minimum is 16,700. Therefore, I will set the class
boundaries to start at 10,000-20,000 and end at 70,000-80,000.

In [7]:
# Bin the continuous "Total Price" into $10,000-wide classes so it can be used
# as a classification target. Each interval is closed on the left and open on
# the right, e.g. 20,000 <= price < 30,000 maps to "20,000-30,000".
price_bin_edges = list(range(10000, 90000, 10000))  # 10,000, 20,000, ..., 80,000
price_bin_labels = [
    "{:,}-{:,}".format(low, high)
    for low, high in zip(price_bin_edges[:-1], price_bin_edges[1:])
]

# pd.cut bins the whole column at once and yields NaN for any price that is
# missing or falls outside [10,000, 80,000). Casting to object gives plain
# label strings instead of a categorical dtype.
binned_prices = pd.cut(
    df["Total Price"],
    bins=price_bin_edges,
    labels=price_bin_labels,
    right=False,
).astype(object)

# Fail loudly instead of continuing without a target: a row with no class
# would otherwise leave `classified_target` undefined (or stale) downstream.
unclassified_rows = binned_prices.isna()
if unclassified_rows.any():
    raise ValueError(
        "Error in classifying: {} row(s) have a Total Price that is missing or "
        "outside the 10,000-80,000 range".format(int(unclassified_rows.sum()))
    )

classified_prices = binned_prices.tolist()

# Reuse df's index so a later column-wise concat lines up row for row.
classified_target = pd.DataFrame(
    classified_prices, columns=["Total Price"], index=df.index
)
print(classified_target.head())

     Total Price
0  50,000-60,000
1  50,000-60,000
2  60,000-70,000
3  60,000-70,000
4  20,000-30,000


Now, I will separate the data into two datasets: one with a continuous target and the other with a discrete target.

In [16]:
# Classification dataset: replace the numeric "Total Price" column with its
# binned label, then discard the "Unnamed: 0" column.
df_discreet = pd.concat(
    (df.drop(columns=["Total Price"]), classified_target),
    axis=1,
).drop(columns=["Unnamed: 0"])

# Regression dataset: an independent deep copy of the original frame that keeps
# the numeric "Total Price", also without the "Unnamed: 0" column.
# NOTE(review): the df.head() preview also lists an "Unnamed: 0.1" column that
# is not dropped here -- confirm whether it should stay in as a feature.
df_continuous = df.copy(deep=True).drop(columns=["Unnamed: 0"])



In [17]:
# Check that "Total Price" now holds the binned class labels.
df_discreet.head()

Unnamed: 0,Number of Bachelor's Degrees,Percent Financial Aid,Average Amount of Aid,Retention Rate,Enrollment,Percent Women,Percent In State,Percent Out of State,Percent Foreign,Percent Unknown,Graduation Rate,Percent Awarded,Total Staff,Instructional Staff,SA Staff,Librarian Staff,Percent Books,Percent Digital,Percent Admitted,Total Price
0,208.0,100.0,32400.0,79.0,996,99.0,59.0,36.0,4.0,0.0,69.0,66.0,357.0,105.0,56.0,62.0,41,12,70.0,"50,000-60,000"
1,310.0,100.0,40855.0,75.0,1533,54.0,66.0,32.0,1.0,0.0,64.0,61.0,435.0,132.0,21.0,27.0,37,54,68.0,"50,000-60,000"
2,398.0,100.0,39796.0,68.0,1912,60.0,53.0,46.0,1.0,0.0,51.0,48.0,355.0,123.0,17.0,21.0,28,13,62.0,"60,000-70,000"
3,382.0,100.0,38689.0,82.0,1771,56.0,50.0,45.0,4.0,0.0,74.0,70.0,426.0,160.0,41.0,50.0,27,46,64.0,"60,000-70,000"
4,61.0,97.0,10055.0,37.0,698,45.0,64.0,34.0,0.0,2.0,31.0,10.0,115.0,41.0,4.0,7.0,20,76,64.0,"20,000-30,000"


In [18]:
# Check that "Total Price" is still the original numeric value.
df_continuous.head()

Unnamed: 0,Number of Bachelor's Degrees,Percent Financial Aid,Average Amount of Aid,Retention Rate,Enrollment,Percent Women,Percent In State,Percent Out of State,Percent Foreign,Percent Unknown,Graduation Rate,Percent Awarded,Total Staff,Instructional Staff,SA Staff,Librarian Staff,Percent Books,Percent Digital,Percent Admitted,Total Price
0,208.0,100.0,32400.0,79.0,996,99.0,59.0,36.0,4.0,0.0,69.0,66.0,357.0,105.0,56.0,62.0,41,12,70.0,55625.0
1,310.0,100.0,40855.0,75.0,1533,54.0,66.0,32.0,1.0,0.0,64.0,61.0,435.0,132.0,21.0,27.0,37,54,68.0,59470.0
2,398.0,100.0,39796.0,68.0,1912,60.0,53.0,46.0,1.0,0.0,51.0,48.0,355.0,123.0,17.0,21.0,28,13,62.0,60636.0
3,382.0,100.0,38689.0,82.0,1771,56.0,50.0,45.0,4.0,0.0,74.0,70.0,426.0,160.0,41.0,50.0,27,46,64.0,63180.0
4,61.0,97.0,10055.0,37.0,698,45.0,64.0,34.0,0.0,2.0,31.0,10.0,115.0,41.0,4.0,7.0,20,76,64.0,23170.0
