In [1]:
import numpy as np
import pandas as pd


# Load the scraped data. Both spellings of the "not reported" placeholder are
# read as missing values, and 'SchoolId' becomes the row index.
PATH = 'data/test.csv'
na_vals = ['Not reported','Not Reported']
df = pd.read_csv(PATH, na_values=na_vals, index_col='SchoolId')

# Status update: every object-dtype column is still waiting to be cleaned.
remaining_cols = list(df.select_dtypes('object').columns)
status_line = "{} columns need to be cleaned.\n".format(len(remaining_cols))
print(status_line)
df.info()

180 columns need to be cleaned.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 190 entries, Name to Disciplines Pursued
dtypes: float64(10), object(180)
memory usage: 3.0+ MB


In [2]:
# Drop columns with no scraped data (every value is missing).
mask = df.isnull().all()
cols_to_drop = df.columns[mask]
df = df.drop(columns=cols_to_drop)


# Disabled: split the 'City, State, Zip' column into three separate columns.
# While this stays commented out, 'City, State, Zip' is treated as a plain
# string column (see string_cols below).
#regex = '^(.*)([A-Z]{2}).*(\d{5})'
#df[['City', 'State', 'Zip']] = df['City, State, Zip'].str.extract(regex)
#df = df.drop(columns='City, State, Zip')
#remaining_cols.remove('City, State, Zip')


# Columns known to hold free-form strings.
string_cols = [
    'Name',
    'Phone',
    'Fax',
    'Web Site',
    'City, State, Zip',
    'Address',
    'E-mail',
    'Nearest Metropolitan Area',
    'Mascot',
    'Activities and Organizations',
    'Other',
    'Disciplines Pursued',
]

# Columns known to hold dates.
date_cols = [
    'Regular Admission Deadline',
    'Regular Admission Notification',
    'Accept Offer of Admission',
    'Early Decision Deadline',
    'Early Decision Notification',
    'Early Action Deadline',
    'Early Action Notification',
    'Application Deadline',
    'Award Notification',
]

# Known string and date columns need no numeric cleaning, so take them off the
# list of columns that remain to be cleaned.
for known_col in string_cols + date_cols:
    if known_col in remaining_cols:
        remaining_cols.remove(known_col)

# Function to convert a regex matched percent number into a decimal number.
def convert_percent(match, ndigits = 10):
    """Turn a regex-matched percentage into the string of its decimal fraction.

    Intended as the replacement callable for a pattern whose first capture
    group is the numeric part of a percentage (e.g. '12.5' out of '12.5%').

    Args:
        match: A regex match object; group(1) must be parseable by float().
        ndigits: Decimal places kept after dividing by 100. It only exists to
            strip binary floating-point noise from the division.

    Returns:
        The fraction as a string, e.g. '0.125' for a captured '12.5'.

    Raises:
        ValueError: If group(1) is not a valid float literal.
    """
    fraction = float(match.group(1)) / 100
    # Round rather than slice the text to 5 characters: slicing truncated
    # digits ('12.345%' -> '0.123') and zeroed small values
    # ('0.05%' -> '0.000').
    return str(round(fraction, ndigits))

# Strip thousands separators and dollar signs, turn percentages into decimal
# fractions, then try to convert each remaining column to numeric.
cols = remaining_cols.copy()
for col in cols:
    # `regex` is passed explicitly on every call: its default differs between
    # pandas versions, ',' and '$' must be treated as literal text ('$' is an
    # end-of-string anchor as a regex), and a callable replacement is only
    # accepted when regex=True.
    cleaned = df[col].str.replace(',', '', regex=False)
    cleaned = cleaned.str.replace('$', '', regex=False)
    cleaned = cleaned.str.replace(r'([\d\.]+)%', convert_percent, regex=True)

    # The cleaned text is kept even if the numeric conversion below fails.
    df[col] = cleaned
    try:
        df[col] = pd.to_numeric(cleaned)
    except (ValueError, TypeError):
        # Not purely numeric; leave the column for the later cleaning steps.
        continue
    remaining_cols.remove(col)


# Convert remaining cols with low number of unique vals to categorical cols.
# Columns with fewer than 30 distinct values are treated as categorical.
cols = remaining_cols.copy()
for col in cols:
    if df[col].nunique() < 30:
        df[col] = df[col].astype('category')
        remaining_cols.remove(col)


# Print a status update.
print("{} columns need to be cleaned.\n".format(len(remaining_cols)))
df.info()

60 columns need to be cleaned.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 183 entries, Name to Disciplines Pursued
dtypes: category(58), float64(44), object(81)
memory usage: 2.1+ MB


In [3]:
def num_count_split(s):
    """Partition a string series by how many numbers each entry contains.

    Args:
        s: A named pandas Series of strings (missing values allowed).

    Returns:
        A list of Series, one for every count k from 0 up to the largest count
        found. Element k holds the entries of `s` with exactly k numeric
        matches and is named '<s.name> - k/<largest count>'. Entries that are
        missing belong to none of the pieces. An empty list is returned when
        `s.any()` is false.
    """
    if not s.any():
        return []

    number_pattern = r'(\d+\.*\d*)'
    per_row = s.str.count(number_pattern)
    top = int(per_row.max())

    pieces = []
    for k in range(top + 1):
        label = s.name + ' - ' + str(k) + '/' + str(top)
        pieces.append(s[per_row == k].rename(label))
    return pieces


def mode_string_split(s_input, threshold = 0.9):
    """Split a series into entries matching its dominant text template and the rest.

    Every number in an entry is replaced by '#' and whitespace is normalised,
    which yields the entry's "template". If the most common template accounts
    for more than `threshold` of the non-missing entries, the series is split
    around that template; otherwise nothing is assigned to the mode series.

    Args:
        s_input: A pandas Series of strings with a string name. It is not
            modified.
        threshold: Minimum share (exclusive) of non-missing entries that must
            follow the most common template for a split to happen.

    Returns:
        [s_mode, s_other], named '<name> (mode)' and '<name> (other)'. After a
        split both have the index of `s_input`: s_mode keeps the original text
        of entries following the dominant template (missing elsewhere) and
        s_other keeps the remaining entries. Without a split, s_mode is an
        empty Series and s_other carries all values of `s_input`.
    """
    # Default outputs.
    s_mode = pd.Series(dtype=object)
    s_other = s_input

    # Build the templates. regex=True is explicit because the default of
    # Series.str.replace is not the same in every pandas version.
    s = s_input.str.replace(r'(\d+\.*\d*)', '#', regex=True)
    s = s.str.replace(r'\s+', ' ', regex=True)
    s = s.str.strip()

    # If the mode string frequency is more than threshold, update the outputs.
    if s.any():
        template_counts = s.value_counts()
        mode_string = template_counts.index[0]
        # Positional access: the index holds template strings, so an integer
        # key would rely on deprecated positional fallback.
        mode_string_count = template_counts.iloc[0]
        mode_string_freq = mode_string_count / s.count()
        if mode_string_freq > threshold:
            s_mode = s_input.mask(s != mode_string)
            s_other = s_input.mask(s == mode_string)

    # rename() returns new objects, so the caller's series keeps its own name
    # even when s_other still refers to s_input.
    s_mode = s_mode.rename(s_input.name + ' (mode)')
    s_other = s_other.rename(s_input.name + ' (other)')

    return [s_mode, s_other]


def split_col(input_s):
    """Split a text column into a default series, secondary series and leftovers.

    The column is first partitioned by the number of numeric values per entry
    (num_count_split) and each partition is split around its dominant text
    template (mode_string_split).

    Args:
        input_s: A named pandas Series of strings.

    Returns:
        A tuple (default, secondary_list, other), all indexed like `input_s`:
          * default: the mode series with the most entries, named like
            `input_s`.
          * secondary_list: every other non-empty mode series, named
            '<name> (secondary <i>)' where i is the partition position. When
            there are none, the list holds a single unnamed all-missing
            series.
          * other: all entries that did not follow a dominant template, named
            '<name> (other)'.
        If the column yields no partitions at all, every returned series is
        all-missing.
    """
    def _blank(name = None):
        # All-missing series aligned to the input. The dtype is explicit
        # because the default dtype of a data-less Series differs between
        # pandas versions and the series is later filled with strings.
        return pd.Series(index = input_s.index, dtype = object, name = name)

    s_list = [mode_string_split(s) for s in num_count_split(input_s)]

    default = _blank(input_s.name)
    other = _blank(input_s.name + ' (other)')
    secondary_list = []

    # Guard: with no partitions there is nothing to rank (max() of an empty
    # list would raise ValueError), so the blank outputs are returned as-is.
    if s_list:
        s_mode_list = [pair[0] for pair in s_list]
        s_mode_counts = [s.count() for s in s_mode_list]

        # The "default" series is the mode series with the most entries; ties
        # go to the first one.
        n = s_mode_counts.index(max(s_mode_counts))
        default.update(s_mode_list[n])

        # Create "secondary" series, as needed, from the other mode series.
        for i, s in enumerate(s_mode_list):
            if i == n or s.empty:
                continue
            secondary = _blank(input_s.name + ' (secondary ' + str(i) + ')')
            secondary.update(s)
            secondary_list.append(secondary)

        # Create a single "other" series with differing strings surrounding
        # vals.
        for _, s_other in s_list:
            other.update(s_other)

    # If there were no secondary series, add an empty one.
    if not secondary_list:
        secondary_list.append(_blank())

    return default, secondary_list, other
    

def extract_cols(s):
    """Pull every number out of a string series into numeric columns.

    Args:
        s: A named pandas Series of strings (missing values allowed).

    Returns:
        A DataFrame with one column per match position, named
        '<s.name> - i/n' for i = 1..n, where n is the number of columns
        produced. Values are converted with pd.to_numeric. When `s.any()` is
        false, an empty DataFrame indexed like `s` is returned instead.
    """
    if not s.any():
        return pd.DataFrame(index = s.index)

    # One row per (entry, match) pair, pivoted so each match position becomes
    # its own column.
    numbers = s.str.extractall(r'(\d+\.*\d*)').unstack()
    for key in numbers.columns:
        numbers[key] = pd.to_numeric(numbers[key])

    total = len(numbers.columns)
    labels = []
    for position in range(1, total + 1):
        labels.append(s.name + ' - ' + str(position) + '/' + str(total))
    numbers.columns = labels
    return numbers

# Split each remaining column into new numeric columns, based on how constant
# the text surrounding the extracted values is.
#
# split_col() first partitions a series by the quantity of numeric values
# found in each record. Each partition is then split in two: a 'mode' series
# holding the records whose surrounding text matches the partition's most
# common template (only when that template covers more than 90% of the
# partition, the default threshold of mode_string_split), and a second series
# holding all other records.
#
# The 'default' series is the 'mode' series with the highest count. The
# secondary series, if they exist, are inside 'secondary_list' and are the
# other 'mode' series. All other records are combined into a single 'other'
# series.

# Names of extracted columns that still need a proper label and/or merging.
split_cols = []
# Iterate over a copy: remaining_cols is appended to and removed from below.
cols = remaining_cols.copy()
for col in cols:
    # Split series.
    default, secondary_list, other = split_col(df[col])
    
    # Print status update.
    #print('Col: {}'.format(col))
    #print('Default count: {}'.format(default.count()))
    #print('Secondary counts: {}'.format([s.count() for s in secondary_list]))
    #print('Other count: {}'.format(other.count()))
    #print('Total count: {}\n'.format(df[col].count()))

    # Join default. all_extracted_df collects every new column for `col`,
    # aligned to df's index.
    all_extracted_df = pd.DataFrame(index = df.index)
    default_extracted_df = extract_cols(default)
    all_extracted_df = all_extracted_df.join(default_extracted_df)
    
    if not default_extracted_df.empty:
        if len(default_extracted_df.columns) == 1:
            # NOTE(review): this rename happens after the join above, so the
            # column already stored in all_extracted_df keeps the name given
            # by extract_cols. Presumably the single extracted column was
            # meant to take the original column's name -- confirm intent.
            default_extracted_df.columns = [col]
        else:
            # Several values per record: each new column needs a real label.
            for default_col in default_extracted_df.columns:
                split_cols.append(default_col)

    # Join secondaries. Every secondary column is queued for re-labeling.
    for secondary in secondary_list:
        secondary_extracted_df = extract_cols(secondary)
        all_extracted_df = all_extracted_df.join(secondary_extracted_df)
        
        if not secondary_extracted_df.empty:
            for secondary_col in secondary_extracted_df.columns:
                split_cols.append(secondary_col)
    
    # Join other. With no default values the column is recorded as a plain
    # string column; otherwise any leftover records are queued for cleaning
    # under the '(other)' name (they are not revisited by this loop, which
    # iterates over the `cols` copy).
    if not default.any():
        string_cols.append(col)
    elif other.any():
        remaining_cols.append(other.name)
    
    all_extracted_df = all_extracted_df.join(other)

    # Join to df and remove original column from df. The original column is
    # dropped only when a default exists and extraction did not yield exactly
    # one column; in every other case it stays in df next to the new columns.
    if default.any() and (len(default_extracted_df.columns) != 1):
        df = df.drop(columns=col)
    df = df.join(all_extracted_df)
    
    remaining_cols.remove(col)

        
# Print a status update.
print("{} columns need to be cleaned.".format(len(remaining_cols)))
print("{} columns need to be re-labeled and/or merged.\n".format(len(split_cols)))
df.info()

15 columns need to be cleaned.
383 columns need to be re-labeled and/or merged.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 600 entries, Name to Average Starting Salary (other)
dtypes: category(58), float64(465), object(77)
memory usage: 8.6+ MB


In [10]:
# Dicts are leftover from previous version...

# Short labels for the '(unusual)' variants of the count columns.
# NOTE: `rename_dict` is assigned again further down in this cell, so this
# mapping is replaced before any later code can read it.
rename_dict = {
    # Admissions.
    'Overall Admission Rate (women) (unusual)': 'Offers (women)',
    'Overall Admission Rate (men) (unusual)': 'Offers (men)',

    # Financial aid, freshmen.
    'Found to Have Financial Need (Freshmen) (unusual)':
        'FinAid Need (freshmen)',
    'Received Financial Aid (Freshmen) (unusual)':
        'FinAid Received (freshmen)',
    'Need Fully Met (Freshmen) (unusual)':
        'FinAid Fully Met (freshmen)',
    'Average Award - Merit-Based Gift (Freshmen) (unusual)':
        'FinAid Merit Gift Students (freshmen)',

    # Financial aid, all undergraduates.
    'Financial Aid Applicants (All Undergraduates) (unusual)':
        'FinAid Apps',
    'Found to Have Financial Need (All Undergraduates) (unusual)':
        'FinAid Need',
    'Received Financial Aid (All Undergraduates) (unusual)':
        'FinAid Received',
    'Need Fully Met (All Undergraduates) (unusual)':
        'FinAid Fully Met',
    'Average Award - Merit-Based Gift (All Undergraduates) (unusual)':
        'FinAid Merit Gift Students',
}

# For each '(unusual)' source column: the new column names mapped to the
# regular expression whose single capture group holds that column's value.
regex_dict = {
    'High School Class Rank (unusual)': {
        'HS Rank Top 10th': r'Top tenth:\s*([\d\.]+)',
        'HS Rank Top 25th': r'Top quarter:\s*([\d\.]+)',
        'HS Rank Top 50th': r'Top half:\s*([\d\.]+)',
    },

    # Freshmen awards.
    'Average Award - Need-Based Gift (Freshmen) (unusual)': {
        'FinAid Need Gift Students (freshmen)': r'Received by ([\d]+)',
        'FinAid Need Gift Pct of FinAid Students (freshmen)': r'\(([\d\.]+)\)',
        'FinAid Need Gift Avg Amount (freshmen)': r'amount (\d+)',
    },
    'Average Award - Need-Based Self-Help (Freshmen) (unusual)': {
        'FinAid Need Self-Help Students (freshmen)': r'Received by ([\d]+)',
        'FinAid Need Self-Help Pct of FinAid Students (freshmen)':
            r'\(([\d\.]+)\)',
        'FinAid Need Self-Help Avg Amount (freshmen)': r'amount (\d+)',
    },
    'Merit-Based Gift (Freshmen) (unusual)': {
        'Non-FinAid Merit Gift Students (freshmen)': r'^(\d+)',
        'Non-FinAid Merit Gift Pct of Students (freshmen)': r'\(([\d\.]+)\)',
        'Non-FinAid Merit Gift Avg Amount (freshmen)': r'amount (\d+)',
    },

    # All-undergraduate awards.
    'Average Award - Need-Based Gift (All Undergraduates) (unusual)': {
        'FinAid Need Gift Students': r'Received by ([\d]+)',
        'FinAid Need Gift Pct of FinAid Students': r'\(([\d\.]+)\)',
        'FinAid Need Gift Avg Amount': r'amount (\d+)',
    },
    'Average Award - Need-Based Self-Help (All Undergraduates) (unusual)': {
        'FinAid Need Self-Help Students': r'Received by ([\d]+)',
        'FinAid Need Self-Help Pct of FinAid Students': r'\(([\d\.]+)\)',
        'FinAid Need Self-Help Avg Amount': r'amount (\d+)',
    },
    'Merit-Based Gift (All Undergraduates) (unusual)': {
        'Non-FinAid Merit Gift Students': r'^(\d+)',
        'Non-FinAid Merit Gift Pct of Students': r'\(([\d\.]+)\)',
        'Non-FinAid Merit Gift Avg Amount': r'amount (\d+)',
    },

    'Nearest Airport (unusual)': {
        'Nearest Airport (city)': r'^(\D+)$',
    },
}

# Final labels for the positional '<column> - <n>' split columns.
rename_dict = {
    # Enrollment.
    'Undergrads (women) - 0': 'Undergrads (women)',
    'Undergrads (women) - 1': 'Undergrads (percent women)',
    'Undergrads (men) - 0': 'Undergrads (men)',
    'Undergrads (men) - 1': 'Undergrads (percent men)',

    # Admission rates.
    'Overall Admission Rate - 0': 'Offer Rate',
    'Overall Admission Rate - 1': 'Apps',
    'Overall Admission Rate (women) - 0': 'Offer Rate (women)',
    'Overall Admission Rate (women) - 1': 'Apps (women)',
    'Overall Admission Rate (men) - 0': 'Offer Rate (men)',
    'Overall Admission Rate (men) - 1': 'Apps (men)',
    'Early Decision Admission Rate - 0': 'ED Offer Rate',
    'Early Decision Admission Rate - 1': 'ED Apps',
    'Early Action Admission Rate - 0': 'EA Offer Rate',
    'Early Action Admission Rate - 1': 'EA Apps',

    # High school class rank.
    'High School Class Rank - 0': 'HS Rank Top 10th',
    'High School Class Rank - 1': 'HS Rank Top 25th',
    'High School Class Rank - 2': 'HS Rank Top 50th',

    # Financial aid, freshmen.
    'Financial Aid Applicants (Freshmen) - 0': 'FinAid Apps (freshmen)',
    'Financial Aid Applicants (Freshmen) - 1': 'FinAid App Pct (freshmen)',
    'Found to Have Financial Need (Freshmen) - 0': 'FinAid Need (freshmen)',
    'Found to Have Financial Need (Freshmen) - 1':
        'FinAid Need Pct (freshmen)',
    'Received Financial Aid (Freshmen) - 0': 'FinAid Received (freshmen)',
    'Received Financial Aid (Freshmen) - 1': 'FinAid Received Pct (freshmen)',
    'Need Fully Met (Freshmen) - 0': 'FinAid Fully Met (freshmen)',
    'Need Fully Met (Freshmen) - 1': 'FinAid Fully Met Pct (freshmen)',
    'Average Award - Need-Based Gift (Freshmen) - 0':
        'FinAid Need Gift Students (freshmen)',
    'Average Award - Need-Based Gift (Freshmen) - 1':
        'FinAid Need Gift Pct of FinAid Students (freshmen)',
    'Average Award - Need-Based Gift (Freshmen) - 2':
        'FinAid Need Gift Avg Amount (freshmen)',
    'Average Award - Need-Based Self-Help (Freshmen) - 0':
        'FinAid Need Self-Help Students (freshmen)',
    'Average Award - Need-Based Self-Help (Freshmen) - 1':
        'FinAid Need Self-Help Pct of FinAid Students (freshmen)',
    'Average Award - Need-Based Self-Help (Freshmen) - 2':
        'FinAid Need Self-Help Avg Amount (freshmen)',
    'Average Award - Merit-Based Gift (Freshmen) - 0':
        'FinAid Merit Gift Students (freshmen)',
    'Average Award - Merit-Based Gift (Freshmen) - 1':
        'FinAid Merit Gift Pct of FinAid Students (freshmen)',
    'Merit-Based Gift (Freshmen) - 0':
        'Non-FinAid Merit Gift Students (freshmen)',
    'Merit-Based Gift (Freshmen) - 1':
        'Non-FinAid Merit Gift Pct of Students (freshmen)',
    'Merit-Based Gift (Freshmen) - 2':
        'Non-FinAid Merit Gift Avg Amount (freshmen)',

    # Financial aid, all undergraduates.
    'Financial Aid Applicants (All Undergraduates) - 0': 'FinAid Apps',
    'Financial Aid Applicants (All Undergraduates) - 1': 'FinAid App Pct',
    'Found to Have Financial Need (All Undergraduates) - 0': 'FinAid Need',
    'Found to Have Financial Need (All Undergraduates) - 1':
        'FinAid Need Pct',
    'Received Financial Aid (All Undergraduates) - 0': 'FinAid Received',
    'Received Financial Aid (All Undergraduates) - 1': 'FinAid Received Pct',
    'Need Fully Met (All Undergraduates) - 0': 'FinAid Fully Met',
    'Need Fully Met (All Undergraduates) - 1': 'FinAid Fully Met Pct',
    'Average Award - Need-Based Gift (All Undergraduates) - 0':
        'FinAid Need Gift Students',
    'Average Award - Need-Based Gift (All Undergraduates) - 1':
        'FinAid Need Gift Pct of FinAid Students',
    'Average Award - Need-Based Gift (All Undergraduates) - 2':
        'FinAid Need Gift Avg Amount',
    'Average Award - Need-Based Self-Help (All Undergraduates) - 0':
        'FinAid Need Self-Help Students',
    'Average Award - Need-Based Self-Help (All Undergraduates) - 1':
        'FinAid Need Self-Help Pct of FinAid Students',
    'Average Award - Need-Based Self-Help (All Undergraduates) - 2':
        'FinAid Need Self-Help Avg Amount',
    'Average Award - Merit-Based Gift (All Undergraduates) - 0':
        'FinAid Merit Gift Students',
    'Average Award - Merit-Based Gift (All Undergraduates) - 1':
        'FinAid Merit Gift Pct of FinAid Students',
    'Merit-Based Gift (All Undergraduates) - 0':
        'Non-FinAid Merit Gift Students',
    'Merit-Based Gift (All Undergraduates) - 1':
        'Non-FinAid Merit Gift Pct of Students',
    'Merit-Based Gift (All Undergraduates) - 2':
        'Non-FinAid Merit Gift Avg Amount',

    # Climate.
    'Temperature - 0': 'Temperature - Avg Jan Low',
    'Temperature - 1': 'Temperature - Avg Sep High',
}