In [None]:
# PREPROCESSING

# Load joined data.

# Dimensionality reduction (removing columns).
    # Missing values ratio.

# Manual dimension reduction (dependent columns).
    # Reduce each trio of gender columns (all / women / men) to an (all) column and a (women pct of all) column.
    # Reduce Apps (all) / Offers (all) / Freshmen (all) to Offer Rate (all) and Yield Rate (all).
    # Drop redundant columns from dataframe.

# Validate data:
    # Find columns that should be pct_cols.
        # Check if pct_cols > 1 or < 0.
        # "Handle" this invalid data (clipping? removal?)
    # Define range_cols (columns that should be restricted to a range) and their allowed ranges:
        # Check if range_cols in ranges.
        # "Handle" invalid data (clipping? removal?)
        
# Normalize continuous data.
    # Box-Cox transform all cols.
    # Normalize all columns (range 0 - 1).

# One-hot encode categorical data.

# Further dimensionality reduction: 
    # Low-variance filter.
    # High-correlation filter.
        # msno.heatmap()
    
# Impute missing values on remaining columns.
    # MICE for missing continuous values using linear regression plus errors.
    # kNN (?) for missing categorical values.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collegedata_names import num_col_ranges
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# --- Configuration -----------------------------------------------------------
# Path of the joined CSV that this notebook preprocesses.
JOINED_CSV_PATH = 'data/joined.csv'
# Numeric columns whose fraction of missing values exceeds this are dropped.
NULL_THRESHOLD = 0.7
# Numeric columns whose variance falls below this are dropped.
VARIANCE_THRESHOLD = 0.01
# Column pairs whose absolute correlation exceeds this count as redundant.
CORRELATION_THRESHOLD = 0.9
# Object columns with fewer unique values than this become categoricals.
CATEGORY_THRESHOLD = 10

# --- Load --------------------------------------------------------------------
# Rows are keyed by the 'SchoolId' column.
df = pd.read_csv(JOINED_CSV_PATH, index_col='SchoolId')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 659 to 3379
Columns: 256 entries, Name to Zip
dtypes: float64(158), object(98)
memory usage: 4.0+ MB


In [3]:
# Missing-values-ratio filter: drop float columns that are mostly empty.
num_cols = df.select_dtypes('float').columns

# Fraction of missing entries per numeric column.
null_freqs = df[num_cols].isna().sum().div(len(df))

# Labels of the columns whose missing fraction exceeds NULL_THRESHOLD.
null_cols = null_freqs[null_freqs > NULL_THRESHOLD].index

df = df.drop(columns=null_cols)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 659 to 3379
Columns: 231 entries, Name to Zip
dtypes: float64(133), object(98)
memory usage: 3.6+ MB


In [4]:
# Manual dimension reduction: replace groups of dependent columns with ratio
# columns, then drop the inputs that have become redundant.

# (new column, numerator column, denominator column) for the share of women
# among applications, offers and enrolled freshmen.
women_share_specs = [
    ('Applications (women pct of all)',
     'Applications (women)', 'Applications (all)'),
    ('Offers (women pct of all)',
     'Offers (women)', 'Offers (all)'),
    ('Freshmen Enrolled (women pct of all)',
     'Freshmen Enrolled (women)', 'Freshmen Enrolled (all)'),
]
for new_col, part_col, whole_col in women_share_specs:
    df[new_col] = df[part_col].div(df[whole_col])

# Total head count, followed by the two ratios derived from it. The total must
# be created first because the undergraduate share divides by it.
df['Students (all)'] = \
    df['All Undergraduates'].add(df['All Graduate Students'])
df['Students (undergraduate pct of all)'] = \
    df['All Undergraduates'].div(df['Students (all)'])
df['Students (full-time pct of all undergrads)'] = \
    df['Full-Time Undergraduates'].div(df['All Undergraduates'])

# Columns now covered by the ratios above (or by the "(all)" rate columns).
# 'Applications (all)' is not listed here, so it stays in the frame.
drop_cols = [
    'Applications (women)',
    'Applications (men)',
    'Offers (all)',
    'Offers (women)',
    'Offers (men)',
    'Freshmen Enrolled (all)',
    'Freshmen Enrolled (women)',
    'Freshmen Enrolled (men)',
    'Offer Rate (men)',
    'Offer Rate (women)',
    'Yield Rate (men)',
    'Yield Rate (women)',
    'Undergraduates (women)',
    'Undergraduates (men)',
    'Undergraduates (men pct of all)',
    'All Undergraduates',
    'All Graduate Students',
    'Full-Time Undergraduates',
]

df = df.drop(columns=drop_cols)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 659 to 3379
Columns: 219 entries, Name to Students (full-time pct of all undergrads)
dtypes: float64(121), object(98)
memory usage: 3.4+ MB


In [5]:
# Validate numeric data: every value outside its legal range is replaced by
# NaN so that it is treated like any other missing value later on.
num_cols = df.select_dtypes('float').columns

# Heuristic: a float column whose mean lies in [0, 1] is treated as a
# proportion and must therefore stay within [0, 1].
pct_cols = num_cols[df[num_cols].mean().between(0, 1)]

# Start from an all-False *boolean* frame. Building it from NaN placeholders
# yields object-dtype columns and relies on fillna(False) downcasting them back
# to bool; newer pandas releases deprecate that downcast, and DataFrame.mask
# requires a boolean condition.
invalid_vals_df = pd.DataFrame(False, index=df.index, columns=df.columns)
invalid_vals_df[pct_cols] = (df[pct_cols] < 0) | (df[pct_cols] > 1)

# Explicit (low, high) bounds imported from collegedata_names. A value is
# invalid only if it is present and outside the inclusive range, so existing
# NaNs are never flagged.
for col, (low, high) in num_col_ranges.items():
    invalid_vals_df[col] = df[col].notna() & ~df[col].between(low, high)

# Blank out the flagged cells.
df = df.mask(invalid_vals_df)

# Number of cells that were invalidated.
invalid_vals_df.sum().sum()

33

In [6]:
# Normalize the numeric columns in two passes: a power transform followed by
# min-max scaling. Both transformers use their default settings (per the
# scikit-learn docs: Yeo-Johnson for PowerTransformer, range [0, 1] for
# MinMaxScaler).
pt = PowerTransformer()
scaler = MinMaxScaler()

# Each pass is fitted on, and written back to, the output of the previous one.
for transformer in (pt, scaler):
    df[num_cols] = transformer.fit_transform(df[num_cols])

In [7]:
# Low-variance filter: after scaling, columns that barely vary are dropped.
variances = df[num_cols].var()
low_var_cols = variances[variances < VARIANCE_THRESHOLD].index

df = df.drop(columns=low_var_cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 659 to 3379
Columns: 206 entries, Name to Students (full-time pct of all undergrads)
dtypes: float64(108), object(98)
memory usage: 3.2+ MB


In [8]:
# High-correlation filter: for every pair of strongly correlated numeric
# columns, keep one and drop the other.
num_cols = df.select_dtypes('float').columns

# True where the absolute pairwise correlation exceeds the threshold.
corr_df = df[num_cols].corr().abs() > CORRELATION_THRESHOLD

# Blank the diagonal and the upper triangle so that each unordered pair is
# reported once and no column is paired with itself.
mask = np.triu(np.ones(corr_df.shape, dtype=bool))
corr_df = corr_df.mask(mask, False)

# Remaining True cells sit strictly below the diagonal, so in each pair the
# first label is the later column and the second label the earlier one.
row_pos, col_pos = np.where(corr_df)
corr_pairs = [(corr_df.columns[r], corr_df.index[c])
              for r, c in zip(row_pos, col_pos)]

# Every column that takes part in at least one correlated pair, in order of
# first appearance.
to_keep = []
for pair in corr_pairs:
    for name in pair:
        if name not in to_keep:
            to_keep.append(name)

# Greedy resolution: when both members of a pair are still kept, drop the
# second one. Pairs with a member already dropped need no further action.
to_drop = []
for first, second in corr_pairs:
    if first not in to_keep or second not in to_keep:
        continue
    to_keep.remove(second)
    to_drop.append(second)

df = df.drop(columns=to_drop)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 659 to 3379
Columns: 168 entries, Name to Students (full-time pct of all undergrads)
dtypes: float64(70), object(98)
memory usage: 2.6+ MB


In [9]:
# Turn object columns with few distinct values into categorical columns.
# Missing entries become their own 'None' category.
object_nunique = df.select_dtypes('object').nunique()
low_cardinality_cols = object_nunique[object_nunique < CATEGORY_THRESHOLD].index

for col in low_cardinality_cols:
    df[col] = df[col].fillna('None').astype('category')

cat_cols = df.select_dtypes('category').columns

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 659 to 3379
Columns: 168 entries, Name to Students (full-time pct of all undergrads)
dtypes: category(51), float64(70), object(47)
memory usage: 1.9+ MB


In [10]:
# One-hot encode the categorical columns: append the indicator columns, then
# remove the categorical columns they were built from.
dummy_df = pd.get_dummies(df[cat_cols])
df = df.join(dummy_df).drop(columns=cat_cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 659 to 3379
Columns: 286 entries, Name to Work-Study Programs_Other work study available
dtypes: float64(70), object(47), uint8(169)
memory usage: 2.2+ MB


In [11]:
# Discard every column that is still of object dtype, leaving only the float
# and one-hot indicator columns.
object_cols = df.select_dtypes('object').columns
df = df.drop(columns=object_cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 659 to 3379
Columns: 239 entries, 2016 Graduates Who Took Out Loans to Work-Study Programs_Other work study available
dtypes: float64(70), uint8(169)
memory usage: 1.5 MB


In [44]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer

# Recursive feature elimination (RFE) on the numeric columns, using the yield
# rate as the regression target.
TARGET_COL = 'Yield Rate (all)'

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

y = df[TARGET_COL]
# Record which rows carry a real target *before* imputation fills the gaps.
# Fitting the selector on mean-imputed targets would train it on fabricated
# labels, so only these rows are used for the fit below.
target_observed = y.notna()

num_cols = df.select_dtypes('float').columns

# Mean-impute every float column (the target included, so the frame itself
# ends up without NaNs in its float columns).
df[num_cols] = imp_mean.fit_transform(df[num_cols])

# The target must not be one of its own predictors.
num_cols = num_cols.drop(TARGET_COL)

# A linear kernel gives RFE the coefficients it ranks features by; per the
# scikit-learn docs, gamma has no effect with a linear kernel.
regressor = SVR(kernel = 'linear', gamma = 'scale')
selector = RFE(regressor)
selector = selector.fit(df.loc[target_observed, num_cols],
                        df.loc[target_observed, TARGET_COL])

# Keep the numeric features RFE selected and drop the rest; the target and the
# non-float columns are not candidates and therefore stay in the frame.
keep_cols = num_cols[selector.support_]
drop_cols = [col for col in num_cols if col not in keep_cols]
df = df.drop(columns = drop_cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 659 to 3379
Columns: 204 entries, Application Fee to Work-Study Programs_Other work study available
dtypes: float64(35), uint8(169)
memory usage: 985.1 KB
