In [67]:
import numpy as np
import pandas as pd

PATH = 'data/test.csv'

na_vals = ['Not reported','Not Reported']

df = pd.read_csv(PATH, index_col = 'SchoolId', na_values = na_vals)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 190 entries, Name to Disciplines Pursued
dtypes: float64(10), object(180)
memory usage: 3.0+ MB


In [68]:
# Drop columns with no scraped data.
mask = df.notnull().sum() == 0
cols_to_drop = df.columns[mask]
df = df.drop(columns = cols_to_drop)


# Split the 'City, State, Zip' column into three separate columns.
regex = '^(.*)([A-Z]{2}).*(\d{5})'
df[['City', 'State', 'Zip']] = df['City, State, Zip'].str.extract(regex)
df = df.drop(columns='City, State, Zip')


# Mark columns known to be strings as dates.
string_cols = ['Phone', 'Fax', 'Web Site','City',
               'State', 'Address', 'E-mail', 'Nearest Metropolitan Area',
               'Mascot']
date_cols = ['Regular Admission Deadline', 'Regular Admission Notification',
             'Accept Offer of Admission', 'Early Decision Deadline', 
             'Early Decision Notification', 'Early Action Deadline', 
             'Early Action Notification', 'Application Deadline',
             'Award Notification']

# Mark remaining columns to be cleaned.
remaining_cols = df.select_dtypes('object').columns.tolist()
for string_col in string_cols:
    remaining_cols.remove(string_col)
for date_col in date_cols:
    remaining_cols.remove(date_col)

# Function to convert a regex matched percent number into a decimal number.
def convert_percent(match):
    value = float(match.group(1))
    value /= 100
    value = str(value)[:5]
    return value

# Remove all commas and try to convert remaining cols to numeric.
cols = remaining_cols.copy()
for col in cols:
    df[col] = df[col].str.replace(',', '')
    df[col] = df[col].str.replace('$', '')
    df[col] = df[col].str.replace('([\d\.]+)%', convert_percent)
    try:
        df[col] = pd.to_numeric(df[col])
        remaining_cols.remove(col)
    except:
        pass

    
# Convert remaining cols with low number of unique vals to categorical cols.
cols = remaining_cols.copy()
for col in cols:
    if df[col].nunique() < 30:
        df[col] = df[col].astype('category')
        remaining_cols.remove(col)

        
# Print a status update.
print("{} columns need to be cleaned.\n".format(len(remaining_cols)))
df.info()

64 columns need to be cleaned.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 185 entries, Name to Zip
dtypes: category(58), float64(45), object(82)
memory usage: 2.1+ MB


In [76]:
def amount_of_numeric_values(series):
    regex = '\d+\.*\d*'
    num_count = series.str.count(regex)
    if num_count.nunique() == 1:
        print(num_count.unique())
        return int(num_count.unique()[-1])
    else:
        return None
    
    
def surrounding_text_constant(series):
    regex = '(\d+\.*\d*)'
    if series.str.replace(regex, '').str.replace('\s+',' ').nunique() == 1:
        return True
    else:
        return None
    
    
def extract_vals(df, col):
    regex = '(\d+\.*\d*)'
    extracted_df = df[col].str.extractall(regex).unstack()
    extracted_df.columns = extracted_df.columns.droplevel(0)
    extracted_cols = extracted_df.columns.tolist()

    df = df.drop(columns=col)
    
    for i, extracted_col in enumerate(extracted_cols):
        label = col
        if len(extracted_cols) > 1:
            label += ' - ' + str(i)

        df[label] = extracted_df[extracted_col]
        df[label] = pd.to_numeric(df[label])
        
    return df


# For remaining cols
cols = remaining_cols.copy()
for col in cols:
    n = amount_of_numeric_values(df[col])
    if n and n == 0:
        string_cols.append(col)
        remaining_cols.remove(col)
    elif n and surrounding_text_constant(df[col]):
        df = extract_vals(df, col)
        remaining_cols.remove(col)
    
# Print a status update.
print("{} columns need to be cleaned.\n".format(len(remaining_cols)))
df.info()

[0]
[nan  1.]
[nan  1.]
[nan  2.]
[nan  2.]
[ 2. nan]


ValueError: cannot convert float NaN to integer

In [70]:
remaining_cols

['Name',
 'Undergrads (women)',
 'Undergrads (men)',
 'Overall Admission Rate',
 'SAT Math',
 'SAT Critical Reading',
 'ACT Composite',
 'Cost of Attendance',
 'Tuition and Fees',
 'Regular Class Size',
 'Students in College Housing',
 'Sororities',
 'Fraternities',
 'Ethnicity of Students from U.S.',
 'International Students',
 'Application Fee',
 'Other',
 'Overall Admission Rate (women)',
 'Overall Admission Rate (men)',
 'Students Enrolled',
 'Students Enrolled (women)',
 'Students Enrolled (men)',
 'Early Decision Admission Rate',
 'Early Action Admission Rate',
 'High School Class Rank',
 'Financial Aid Applicants (Freshmen)',
 'Found to Have Financial Need (Freshmen)',
 'Received Financial Aid (Freshmen)',
 'Need Fully Met (Freshmen)',
 'Average Award - Need-Based Gift (Freshmen)',
 'Average Award - Need-Based Self-Help (Freshmen)',
 'Average Award - Merit-Based Gift (Freshmen)',
 'Merit-Based Gift (Freshmen)',
 'Financial Aid Applicants (All Undergraduates)',
 'Found to Have Fi

In [37]:
# For each remaining col, find the frequency of the num val counts.
freqs = {}
cols = remaining_cols.copy()
for col in cols:
    num_count_freq = []
    regex = '[\d\.]+'
    num_count = df[col].str.count(regex)
    counts = [(num_count == i).sum() for i in range(0, int(num_count.max()) + 1)]
    freqs[col] = [count / df[col].count() for count in counts]
    
    if max(freqs[col]) == 1:
        n_vals = freqs[col].index(1)
        if n_vals == 0:
            string_cols.append(col)
        else:
            if df[col].str.replace(regex,'').nunique() == 1:
                

In [7]:
# For each remaining col, if the max freq is over 95%, move the 'unusual' vals
# with a different num val count into a new separate col.
exact_num_count_cols = []
unusual_num_count_cols = []
remaining_cols = []

for col in cols:
    if max(freqs[col]) == 1:
        if freqs[col].index(max(freqs[col])) == 0:
            true_string_cols.append(col)
        else:
            exact_num_count_cols.append(col)
    elif max(freqs[col]) >= 0.95:
        # Copy vals with different num vals to new df col.
        n = freqs[col].index(max(freqs[col]))
        if n != 0:
            num_count = df[col].str.count('\D*\d+\.*\d*\D*')
            mask = num_count != n
            new_col = col + ' (unusual)'
            df.loc[mask, new_col] = df.loc[mask, col]
            df.loc[mask, col] = np.nan
            unusual_num_count_cols.append(new_col)
            exact_num_count_cols.append(col)
        else:
            true_string_cols.append(col)
    else:
        remaining_cols.append(col)
cols = remaining_cols

In [8]:
# Separating values from the exact num cols

# Check to see if the text surrounding vals are the same
clean_exact_num_cols = []
dirty_exact_num_cols = []
for col in exact_num_count_cols:
    regex = '[\d\.]+'
    temp_col = df[col].str.replace(regex, '').str.replace('\s{2,}',' ')
    if temp_col.nunique() == 1:
        clean_exact_num_cols.append(col)
    else:
        dirty_exact_num_cols.append(col)

In [9]:


# Separate numeric values into their own numeric columns.
for col in clean_exact_num_cols:
    regex = '\D*(\d+\.*\d*)\D*'
    extracted_df = df[col].str.extractall(regex).unstack()
    extracted_df.columns = extracted_df.columns.droplevel(0)
    extracted_cols = extracted_df.columns.tolist()
    
    df = df.drop(columns=col)
    
    for i, extracted_col in enumerate(extracted_cols):
        if len(extracted_cols) > 1:
            label = col + ' - ' + str(i)
        else:
            label = col
        df[label] = extracted_df[extracted_col]
        df[label] = pd.to_numeric(df[label])

In [10]:
# Rename the split num val cols.
rename_dict = {
    'Undergrads (women) - 0':'Undergrads (women)',
    'Undergrads (women) - 1':'Undergrads (percent women)',
    'Undergrads (men) - 0':'Undergrads (men)',
    'Undergrads (men) - 1':'Undergrads (percent men)',
    'Overall Admission Rate - 0':'Offer Rate',
    'Overall Admission Rate - 1':'Apps',
    'Overall Admission Rate (women) - 0':'Offer Rate (women)',
    'Overall Admission Rate (women) - 1':'Apps (women)',
    'Overall Admission Rate (men) - 0':'Offer Rate (men)',
    'Overall Admission Rate (men) - 1':'Apps (men)',
    'Early Decision Admission Rate - 0':'ED Offer Rate',
    'Early Decision Admission Rate - 1':'ED Apps',
    'Early Action Admission Rate - 0':'EA Offer Rate',
    'Early Action Admission Rate - 1':'EA Apps',
    'High School Class Rank - 0':'HS Rank Top 10th',
    'High School Class Rank - 1':'HS Rank Top 25th',
    'High School Class Rank - 2':'HS Rank Top 50th',
    'Financial Aid Applicants (Freshmen) - 0':'FinAid Apps (freshmen)',
    'Financial Aid Applicants (Freshmen) - 1':'FinAid App Pct (freshmen)',
    'Found to Have Financial Need (Freshmen) - 0':'FinAid Need (freshmen)',
    'Found to Have Financial Need (Freshmen) - 1': \
        'FinAid Need Pct (freshmen)',
    'Received Financial Aid (Freshmen) - 0':'FinAid Received (freshmen)',
    'Received Financial Aid (Freshmen) - 1':'FinAid Received Pct (freshmen)',
    'Need Fully Met (Freshmen) - 0':'FinAid Fully Met (freshmen)',
    'Need Fully Met (Freshmen) - 1':'FinAid Fully Met Pct (freshmen)',
    'Average Award - Need-Based Gift (Freshmen) - 0': \
        'FinAid Need Gift Students (freshmen)',
    'Average Award - Need-Based Gift (Freshmen) - 1': \
        'FinAid Need Gift Pct of FinAid Students (freshmen)',
    'Average Award - Need-Based Gift (Freshmen) - 2': \
        'FinAid Need Gift Avg Amount (freshmen)',
    'Average Award - Need-Based Self-Help (Freshmen) - 0': \
        'FinAid Need Self-Help Students (freshmen)',
    'Average Award - Need-Based Self-Help (Freshmen) - 1': \
        'FinAid Need Self-Help Pct of FinAid Students (freshmen)',
    'Average Award - Need-Based Self-Help (Freshmen) - 2': \
        'FinAid Need Self-Help Avg Amount (freshmen)',
    'Average Award - Merit-Based Gift (Freshmen) - 0': \
        'FinAid Merit Gift Students (freshmen)',
    'Average Award - Merit-Based Gift (Freshmen) - 1': \
        'FinAid Merit Gift Pct of FinAid Students (freshmen)',
    'Merit-Based Gift (Freshmen) - 0': \
        'Non-FinAid Merit Gift Students (freshmen)',
    'Merit-Based Gift (Freshmen) - 1': \
        'Non-FinAid Merit Gift Pct of Students (freshmen)',
    'Merit-Based Gift (Freshmen) - 2': \
        'Non-FinAid Merit Gift Avg Amount (freshmen)',
    'Financial Aid Applicants (All Undergraduates) - 0':'FinAid Apps',
    'Financial Aid Applicants (All Undergraduates) - 1':'FinAid App Pct',
    'Found to Have Financial Need (All Undergraduates) - 0':'FinAid Need',
    'Found to Have Financial Need (All Undergraduates) - 1': \
        'FinAid Need Pct',
    'Received Financial Aid (All Undergraduates) - 0':'FinAid Received',
    'Received Financial Aid (All Undergraduates) - 1':'FinAid Received Pct',
    'Need Fully Met (All Undergraduates) - 0':'FinAid Fully Met',
    'Need Fully Met (All Undergraduates) - 1':'FinAid Fully Met Pct',
    'Average Award - Need-Based Gift (All Undergraduates) - 0': \
        'FinAid Need Gift Students',
    'Average Award - Need-Based Gift (All Undergraduates) - 1': \
        'FinAid Need Gift Pct of FinAid Students',
    'Average Award - Need-Based Gift (All Undergraduates) - 2': \
        'FinAid Need Gift Avg Amount',
    'Average Award - Need-Based Self-Help (All Undergraduates) - 0': \
        'FinAid Need Self-Help Students',
    'Average Award - Need-Based Self-Help (All Undergraduates) - 1': \
        'FinAid Need Self-Help Pct of FinAid Students',
    'Average Award - Need-Based Self-Help (All Undergraduates) - 2': \
        'FinAid Need Self-Help Avg Amount',
    'Average Award - Merit-Based Gift (All Undergraduates) - 0': \
        'FinAid Merit Gift Students',
    'Average Award - Merit-Based Gift (All Undergraduates) - 1': \
        'FinAid Merit Gift Pct of FinAid Students',
    'Merit-Based Gift (All Undergraduates) - 0': \
        'Non-FinAid Merit Gift Students',
    'Merit-Based Gift (All Undergraduates) - 1': \
        'Non-FinAid Merit Gift Pct of Students',
    'Merit-Based Gift (All Undergraduates) - 2': \
        'Non-FinAid Merit Gift Avg Amount',
    'Temperature - 0':'Temperature - Avg Jan Low',
    'Temperature - 1':'Temperature - Avg Sep High'
}

df = df.rename(columns=rename_dict)

In [12]:
# For each remaining col, find the frequency of the num val counts.
freqs = {}
for col in unusual_num_count_cols:
    num_count_freq = []
    num_count = df[col].str.count('\D*\d+\.*\d*\D*')
    counts = [(num_count == i).sum() for i in range(0, int(num_count.max()) + 1)]
    freqs[col] = [count / df[col].count() for count in counts]

In [16]:
# Find the cols with only n num vals.
staging_area = []
remaining = []
for col, freq in freqs.items():
    if max(freq) == 1:
        n = freq.index(max(freq))
        staging_area.append(col)
    else:
        remaining.append(col)

In [19]:
# Find the cols with identical string text after removing num val.
clean_cols = []
dirty_cols = []
for col in staging_area:
    regex = '\D*\d+\.*\d*\D*'
    temp_col = df[col].str.replace(regex, '')
    if temp_col.nunique() == 1:
        clean_cols.append(col)
    else:
        dirty_cols.append(col)
        


[]

In [11]:
rename_dict = {
    'Overall Admission Rate (women) (unusual)':'Offers (women)',
    'Overall Admission Rate (men) (unusual)':'Offers (men)',
    'Found to Have Financial Need (Freshmen) (unusual)': \
        'FinAid Need (freshmen)',
    'Received Financial Aid (Freshmen) (unusual)': \
        'FinAid Received (freshmen)',
    'Need Fully Met (Freshmen) (unusual)':'FinAid Fully Met (freshmen)',
    'Average Award - Merit-Based Gift (Freshmen) (unusual)': \
        'FinAid Merit Gift Students (freshmen)',
    'Financial Aid Applicants (All Undergraduates) (unusual)':'FinAid Apps',
    'Found to Have Financial Need (All Undergraduates) (unusual)': \
        'FinAid Need',
    'Received Financial Aid (All Undergraduates) (unusual)':'FinAid Received',
    'Need Fully Met (All Undergraduates) (unusual)':'FinAid Fully Met',
    'Average Award - Merit-Based Gift (All Undergraduates) (unusual)': \
        'FinAid Merit Gift Students'
}

cols_to_drop = ['Temperature (unusual)', 'Average Starting Salary (unusual)']

# Delete text surrounding num val.
for col in clean_cols:
    regex = '\D*(\d+\.*\d*)\D*'
    df[col] = df[col].str.extract(regex)

# Update the columns with the extracted unusual vals, or create new col if 
# necessary. Finally, drop the unusual cols.
for temp_label, col in rename_dict.items():
    if col in df.columns:
        df[col].update(df[temp_label])
    else:
        df[col] = df[temp_label]
    
    df[col] = pd.to_numeric(df[col])
    df.drop(columns=temp_label)

In [18]:
regex_dict = {
    'High School Class Rank (unusual)':{
        'HS Rank Top 10th':'Top tenth:\s*([\d\.]+)',
        'HS Rank Top 25th':'Top quarter:\s*([\d\.]+)',
        'HS Rank Top 50th':'Top half:\s*([\d\.]+)'},
    'Average Award - Need-Based Gift (Freshmen) (unusual)':{
        'FinAid Need Gift Students (freshmen)':'Received by ([\d]+)',
        'FinAid Need Gift Pct of FinAid Students (freshmen)':'\(([\d\.]+)\)',
        'FinAid Need Gift Avg Amount (freshmen)':'amount (\d+)'},
    'Average Award - Need-Based Self-Help (Freshmen) (unusual)':{
        'FinAid Need Self-Help Students (freshmen)':'Received by ([\d]+)',
        'FinAid Need Self-Help Pct of FinAid Students (freshmen)':'\(([\d\.]+)\)',
        'FinAid Need Self-Help Avg Amount (freshmen)':'amount (\d+)'},
    'Merit-Based Gift (Freshmen) (unusual)':{
        'Non-FinAid Merit Gift Students (freshmen)':'^(\d+)',
        'Non-FinAid Merit Gift Pct of Students (freshmen)':'\(([\d\.]+)\)',
        'Non-FinAid Merit Gift Avg Amount (freshmen)':'amount (\d+)'},
    'Average Award - Need-Based Gift (All Undergraduates) (unusual)':{
        'FinAid Need Gift Students':'Received by ([\d]+)',
        'FinAid Need Gift Pct of FinAid Students':'\(([\d\.]+)\)',
        'FinAid Need Gift Avg Amount':'amount (\d+)'},
    'Average Award - Need-Based Self-Help (All Undergraduates) (unusual)':{
        'FinAid Need Self-Help Students':'Received by ([\d]+)',
        'FinAid Need Self-Help Pct of FinAid Students':'\(([\d\.]+)\)',
        'FinAid Need Self-Help Avg Amount':'amount (\d+)'},
    'Merit-Based Gift (All Undergraduates) (unusual)':{
        'Non-FinAid Merit Gift Students':'^(\d+)',
        'Non-FinAid Merit Gift Pct of Students':'\(([\d\.]+)\)',
        'Non-FinAid Merit Gift Avg Amount':'amount (\d+)'},
    'Nearest Airport (unusual)':{
        'Nearest Airport (city)':'^(\D+)$'}
}

dirty_exact_num_cols

['Campus Size', 'Temperature', 'Rain', 'Nearest Airport']

SchoolId
957    Bismarck ND  58504-9652
Name: City, State, Zip, dtype: object

In [14]:
for col in cols:
    print(col)
    for n, freq in enumerate(freqs[col]):
        print('\t{}: {:.2f}'.format(n, freq))

SAT Math
	0: 0.00
	1: 0.01
	2: 0.00
	3: 0.38
	4: 0.61
SAT Critical Reading
	0: 0.00
	1: 0.01
	2: 0.00
	3: 0.38
	4: 0.61
ACT Composite
	0: 0.00
	1: 0.02
	2: 0.00
	3: 0.26
	4: 0.72
Cost of Attendance
	0: 0.30
	1: 0.46
	2: 0.23
Tuition and Fees
	0: 0.00
	1: 0.67
	2: 0.33
Regular Class Size
	0: 0.00
	1: 0.00
	2: 0.00
	3: 0.01
	4: 0.00
	5: 0.00
	6: 0.01
	7: 0.00
	8: 0.00
	9: 0.04
	10: 0.00
	11: 0.00
	12: 0.10
	13: 0.00
	14: 0.00
	15: 0.14
	16: 0.00
	17: 0.01
	18: 0.30
	19: 0.00
	20: 0.39
Ethnicity of Students from U.S.
	0: 0.00
	1: 0.01
	2: 0.00
	3: 0.00
	4: 0.00
	5: 0.01
	6: 0.01
	7: 0.02
	8: 0.95
International Students
	0: 0.00
	1: 0.19
	2: 0.81
Address
	0: 0.04
	1: 0.83
	2: 0.12
	3: 0.01
City, State, Zip
	0: 0.00
	1: 0.48
	2: 0.52
Phone
	0: 0.00
	1: 0.00
	2: 0.00
	3: 0.92
	4: 0.08
Application Fee
	0: 0.25
	1: 0.75
Other
	0: 0.78
	1: 0.13
	2: 0.06
	3: 0.01
	4: 0.01
	5: 0.00
	6: 0.00
	7: 0.00
	8: 0.00
	9: 0.00
	10: 0.00
	11: 0.00
	12: 0.00
	13: 0.00
	14: 0.00
	15: 0.00
	16: 0.00
Students E

In [53]:
# Investigate the dirty_exact_num_cols to see what can be done.
clean_exact_num_cols += ['Campus Size','Temperature, Rain']

# Manually deal with the nearest airport
df['Nearest Airport (city)'] = df['Nearest Airport'].str.extract(r'in (.*)')
df['Nearest Airport (distance)'] = df['Nearest Airport'].str.extract(r'(\d+)')
df.drop(columns='Nearest Airport')