In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Seed random number generator
np.random.seed(16378429)

In [3]:
# Load numerical data
numeric_df = pd.read_csv('rmpCapstoneNum.csv', header=None)

# Assign column names
numeric_df.columns = [
    'Average_Rating',
    'Average_Difficulty',
    'Number_of_Ratings',
    'Has_Pepper',
    'Would_Take_Again_Proportion',
    'Online_Class_Ratings',
    'Male',
    'Female'
]

# Load qualitative data
qual_df = pd.read_csv('rmpCapstoneQual.csv', header=None)

# Assign column names
qual_df.columns = [
    'Field',
    'University',
    'State'
]

# Merge dataframes
df = pd.concat([numeric_df, qual_df], axis=1)

# Check df
print(df.shape)
df.head()

(89893, 11)


Unnamed: 0,Average_Rating,Average_Difficulty,Number_of_Ratings,Has_Pepper,Would_Take_Again_Proportion,Online_Class_Ratings,Male,Female,Field,University,State
0,5.0,1.5,2.0,0.0,,0.0,0,1,Criminal Justice,George Mason University,VA
1,,,,,,,0,0,,,
2,3.2,3.0,4.0,0.0,,0.0,1,0,English,Alabama State University,AL
3,3.6,3.5,10.0,1.0,,0.0,0,0,English,University of Kentucky,KY
4,1.0,5.0,1.0,0.0,,0.0,0,0,English,Keiser University,FL


In [4]:
# Check data
print("Missing values per column\n", df.isnull().sum())

print("\nBasic stats for numerical cols:\n", df.describe())

print("\nData types:\n", df.dtypes)


Missing values per column
 Average_Rating                 19889
Average_Difficulty             19889
Number_of_Ratings              19889
Has_Pepper                     19889
Would_Take_Again_Proportion    77733
Online_Class_Ratings           19889
Male                               0
Female                             0
Field                          19889
University                     19889
State                          19889
dtype: int64

Basic stats for numerical cols:
        Average_Rating  Average_Difficulty  Number_of_Ratings    Has_Pepper  \
count    70004.000000        70004.000000       70004.000000  70004.000000   
mean         3.808014            2.864623           5.374721      0.279927   
std          1.126895            0.991064           8.136628      0.448966   
min          1.000000            1.000000           1.000000      0.000000   
25%          3.000000            2.000000           1.000000      0.000000   
50%          4.000000            3.000000          

In [5]:
# Convert booleans
boolean_columns = ['Has_Pepper', 'Male', 'Female']
for col in boolean_columns:
    if df[col].dtype != bool:
        df[col] = df[col].astype(bool)

In [6]:
# TEMP


# Basic preprocessing - handle missing values

# For numerical columns, fill with median
# Conservative approach for bools, replaces missing with False
for col in numeric_df.columns:
    if df[col].isnull().sum() > 0:
        if col in boolean_columns:
            df[col].fillna(False, inplace=True)
        else:
            df[col].fillna(df[col].median(), inplace=True)

# For categorical columns, fill with mode
for col in qual_df.columns:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mode()[0], inplace=True)

# Check if we've handled all missing values
print("\nRemaining missing values after preprocessing:")
print(df.isnull().sum())


Remaining missing values after preprocessing:
Average_Rating                 0
Average_Difficulty             0
Number_of_Ratings              0
Has_Pepper                     0
Would_Take_Again_Proportion    0
Online_Class_Ratings           0
Male                           0
Female                         0
Field                          0
University                     0
State                          0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
