In [1]:
import numpy as np
import pandas as pd
from collegedata_names import col_rename_dict

# Location of the scraped data, plus the placeholder strings that should be
# read in as missing values.
PATH = 'data/test.csv'
na_vals = ['Not reported','Not Reported','Not available']

# Load the CSV, using the 'SchoolId' column as the index.
df = pd.read_csv(PATH, na_values=na_vals, index_col='SchoolId')

# Every column that came in with 'object' dtype is queued for cleaning.
remaining_cols = list(df.select_dtypes(include='object').columns)

# Status update: frame summary followed by the size of the cleaning queue.
df.info()
print("\n{} columns need to be cleaned.".format(len(remaining_cols)))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 190 entries, Name to Disciplines Pursued
dtypes: float64(10), object(180)
memory usage: 3.0+ MB

180 columns need to be cleaned.


In [2]:
# Identify the columns in which every value is missing, then drop them.
cols_to_drop = df.columns[df.isna().all()].tolist()
df = df.drop(columns=cols_to_drop)

# The dropped columns no longer need cleaning.
remaining_cols = [col for col in remaining_cols if col not in cols_to_drop]

# Status update: one tab-indented line per dropped column, a blank line,
# then the frame summary and the size of the cleaning queue.
print("Dropped the following {} empty columns:".format(len(cols_to_drop)))
print(''.join("\t{}\n".format(col) for col in cols_to_drop))
df.info()
print("\n{} columns need to be cleaned.".format(len(remaining_cols)))

Dropped the following 7 empty columns:
	SAT Writing
	Parents Borrowing PLUS Loans
	Academic Interest/Achievement - Top Areas (By Money Awarded)
	Creative Arts/Performance - Top Areas (By Money Awarded)
	Special Achievements/Activities - Top Areas (By Money Awarded)
	Special Characteristics - Top Areas (By Money Awarded)
	Internet/E-mail Access

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 183 entries, Name to Disciplines Pursued
dtypes: float64(3), object(180)
memory usage: 2.8+ MB

180 columns need to be cleaned.


In [3]:
# Columns known up front to hold free text.
string_cols = [
    'Name',
    'Phone',
    'Fax',
    'Web Site',
    'City, State, Zip',
    'Address',
    'E-mail',
    'Nearest Metropolitan Area',
    'Mascot',
    'Activities and Organizations',
    'Other',
    'Disciplines Pursued',
]

# Columns known up front to hold dates.
date_cols = [
    'Regular Admission Deadline',
    'Regular Admission Notification',
    'Accept Offer of Admission',
    'Early Decision Deadline',
    'Early Decision Notification',
    'Early Action Deadline',
    'Early Action Notification',
    'Application Deadline',
    'Award Notification',
]

# Take both groups out of the cleaning queue in a single pass.
remaining_cols = [col for col in remaining_cols
                  if col not in string_cols and col not in date_cols]

# Status update (the frame itself is unchanged by this cell).
df.info()
print("\n{} columns need to be cleaned.".format(len(remaining_cols)))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 183 entries, Name to Disciplines Pursued
dtypes: float64(3), object(180)
memory usage: 2.8+ MB

159 columns need to be cleaned.


In [4]:
# Delete commas and dollar signs from all values in the dataframe.
# NOTE(review): this runs on every column, so free-text columns such as
# 'City, State, Zip' lose their commas too -- confirm that is intended.
df.replace(r'[,\$]', '', regex = True, inplace = True)


def _percent_to_decimal(match):
    """Return the matched percentage (group 1) as a decimal-fraction string."""
    return str(float(match.group(1)) / 100)


# Convert percents to decimals, and attempt convert cols to numeric type.
for col in remaining_cols:
    # regex=True must be explicit: a callable replacement is rejected when
    # regex is False, which is the default on newer pandas releases.
    df[col] = df[col].str.replace(r'([\d\.]+)%', _percent_to_decimal,
                                  regex = True)
    # Equivalent to the deprecated errors='ignore': keep the column as it is
    # when any value cannot be parsed as a number.
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        pass

# Remove successfully converted numeric type cols from remaining cols.
# 'number' is used rather than 'float' so that a column parsed to an integer
# dtype (no missing values, no decimals) is also recognised as converted.
num_cols = df.select_dtypes('number').columns.tolist()
remaining_cols = [col for col in remaining_cols if col not in num_cols]

# Print a status update.
df.info()
print("\n{} columns need to be cleaned.".format(len(remaining_cols)))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 183 entries, Name to Disciplines Pursued
dtypes: float64(44), object(139)
memory usage: 2.8+ MB

118 columns need to be cleaned.


In [5]:
# Any queued column with fewer than 30 distinct values becomes categorical;
# the conversion is applied to all such columns in one astype call.
df = df.astype({col: 'category'
                for col in remaining_cols
                if df[col].nunique() < 30})

# Columns that are now categorical leave the cleaning queue.
cat_cols = list(df.select_dtypes(include='category').columns)
remaining_cols = [col for col in remaining_cols if col not in cat_cols]

# Status update.
df.info()
print("\n{} columns need to be cleaned.".format(len(remaining_cols)))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 183 entries, Name to Disciplines Pursued
dtypes: category(58), float64(44), object(81)
memory usage: 2.1+ MB

60 columns need to be cleaned.


In [6]:
# If remaining col values are mostly non numeric, designate as string col.
# For each column, count the number-like tokens in every value; when the most
# common count is zero, the typical value holds no number at all.
for col in remaining_cols:
    # Raw string: '\d' and '\.' are not valid escapes in a plain literal.
    numbers_per_value = df[col].str.count(r'(\d+\.*\d*)')
    if numbers_per_value.mode()[0] == 0:
        string_cols.append(col)

# Remove columns designated as string cols from remaining cols.
remaining_cols = [col for col in remaining_cols if col not in string_cols]

# Print a status update.
print("{} columns designated as string columns.\n".format(len(string_cols)))
df.info()
print("\n{} columns need to be cleaned.".format(len(remaining_cols)))

25 columns designated as string columns.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 183 entries, Name to Disciplines Pursued
dtypes: category(58), float64(44), object(81)
memory usage: 2.1+ MB

47 columns need to be cleaned.


In [7]:
# Create copy of df with all numeric values and extra whitespace stripped.
# Every number-like token collapses to '#', so values that differ only in
# their numbers share the same stripped text.
# regex=True is passed explicitly: on newer pandas releases str.replace
# treats the pattern as a literal string by default, which would leave the
# values untouched and silently break the comparison below.
stripped_df = pd.DataFrame(index = df.index)
for col in remaining_cols:
    stripped_df[col] = df[col].str.replace(r'\d+\.*\d*', '#', regex = True)
    stripped_df[col] = stripped_df[col].str.replace(r'\s+', ' ', regex = True)
    stripped_df[col] = stripped_df[col].str.strip()

# Split the remaining cols into clean and dirty dfs, where a clean col is one
# composed only of values with consistent stripped text and is greater than
# 50% of the total column values.
# Columns whose most common stripped text covers 50% or less of the non-null
# values go into neither frame and stay in remaining_cols.
clean_df = pd.DataFrame(index = df.index)
dirty_df = pd.DataFrame(index = df.index)
for col in remaining_cols:
    stripped_mode = stripped_df[col].mode()[0]
    mode_vals = (stripped_df[col] == stripped_mode)
    if mode_vals.sum() / df[col].count() > 0.5:
        # Rows matching the dominant text keep their value in clean_df; all
        # other rows keep theirs in dirty_df (the rest is NaN in each).
        clean_df[col] = df[col].where(mode_vals)
        dirty_df[col] = df[col].mask(mode_vals)

# Handle the clean columns.
# Each number found in a clean value becomes its own numeric column, named
# '<original column> - <position>' with positions starting at 1.
extracted_df = pd.DataFrame(index = df.index)
for col in clean_df.columns:
    vals = clean_df[col].str.extractall(r'(\d+\.*\d*)').unstack()
    vals.columns = [col + ' - ' + str(i) for i in range(1, vals.shape[1] + 1)]
    for val_col in vals.columns:
        extracted_df[val_col] = pd.to_numeric(vals[val_col])
df = df.join(extracted_df)

# Handle the dirty columns.
# Columns with no leftover values are discarded; the rest are attached to df
# under the original name plus a ' (dirty)' suffix.
dirty_df.dropna(how = 'all', axis = 1, inplace = True)
dirty_df = dirty_df.add_suffix(' (dirty)')
df = df.join(dirty_df)

dirty_cols = dirty_df.columns
remaining_cols = [col for col in remaining_cols
                  if col not in clean_df.columns]

# The rename runs before the drop, as in the original statement order.
df.rename(columns = col_rename_dict, inplace = True)

df.drop(columns = clean_df.columns, inplace = True)

# Print a status update.
print("{} columns split into {} numeric and {} other uncleaned columns.\n"
      .format(clean_df.shape[1], extracted_df.shape[1], dirty_df.shape[1]))
df.info()
print("\n{} original columns and {} dirty columns need to be cleaned."
      .format(len(remaining_cols), len(dirty_cols)))

42 columns split into 97 numeric and 33 other uncleaned columns.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 267 entries, Name to Average Starting Salary (dirty)
dtypes: category(58), float64(137), object(72)
memory usage: 3.4+ MB

5 original columns and 33 dirty columns need to be cleaned.


In [8]:
# One pattern per size bucket; the capture group is the number that follows
# the bucket's label. Raw strings keep '\d' and '\.' from being read as
# (invalid) string escapes.
regexs = [r'2-9 students: ([\d\.]+)', r'10-19 students: ([\d\.]+)',
          r'20-29 students: ([\d\.]+)', r'30-39 students: ([\d\.]+)',
          r'40-49 students: ([\d\.]+)', r'50-99 students: ([\d\.]+)',
          r'Over 100 students: ([\d\.]+)']

class_labels = ['Class Size (% 2-9)', 'Class Size (% 10-19)',
                'Class Size (% 20-29)', 'Class Size (% 30-39)',
                'Class Size (% 40-49)', 'Class Size (% 50-99)',
                'Class Size (% 100+)']

lab_labels = ['Lab Size (% 2-9)', 'Lab Size (% 10-19)', 'Lab Size (% 20-29)',
              'Lab Size (% 30-39)', 'Lab Size (% 40-49)',
              'Lab Size (% 50-99)', 'Lab Size (% 100+)']

# Map each source column to its {pattern: new column name} table.
class_dict = dict(zip(regexs, class_labels))
lab_dict = dict(zip(regexs, lab_labels))
extract_dict = {'Regular Class Size': class_dict,
                'Discussion Section/Lab Class Size': lab_dict}

for col, dictionary in extract_dict.items():
    for regex, label in dictionary.items():
        # expand=False returns a Series for the single capture group, which
        # can be assigned to one column directly.
        df[label] = pd.to_numeric(df[col].str.extract(regex, expand = False))

    # The source column is fully replaced by its per-bucket columns.
    df.drop(columns = col, inplace = True)
    remaining_cols.remove(col)

print("2 columns split into 14 numeric columns.\n")
df.info()
print("\n{} original columns and {} dirty columns need to be cleaned."
      .format(len(remaining_cols), len(dirty_cols)))

2 columns split into 14 numeric columns.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 279 entries, Name to Lab Size (% 100+)
dtypes: category(58), float64(151), object(70)
memory usage: 3.6+ MB

3 original columns and 33 dirty columns need to be cleaned.


In [9]:
# Split each "nearest ..." column into a numeric distance and a city name.
cols = ['Nearest Airport', 'Nearest Train Station', 'Nearest Bus Station']
# Group 1: the first run of digits in the value; group 2: the non-digit text
# that follows ' in '.
regex = r'(\d+).* in (\D*)'
for col in cols:
    extracted = df[col].str.extract(regex)
    df[col + ' (miles)'] = pd.to_numeric(extracted[0])
    df[col + ' (city)'] = extracted[1]

    df.drop(columns = col, inplace = True)

    # Record each new column under its actual type: the distance is numeric
    # and the city is text (the two lists were previously swapped).
    num_cols.append(col + ' (miles)')
    string_cols.append(col + ' (city)')
    remaining_cols.remove(col)

print("3 columns split into 3 numeric columns and 3 string columns.\n")
df.info()
print("\n{} original columns and {} dirty columns need to be cleaned."
      .format(len(remaining_cols), len(dirty_cols)))

3 columns split into 3 numeric columns and 3 string columns.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2028 entries, 6 to 3379
Columns: 282 entries, Name to Nearest Bus Station (city)
dtypes: category(58), float64(154), object(70)
memory usage: 3.6+ MB

0 original columns and 33 dirty columns need to be cleaned.
