# Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import csv

In [71]:
def preprocess(filepath):
    """Clean a CSV table and export it as an ARFF file.

    Steps performed on the table read from ``filepath``:

    1. Literal ``'N/A'`` cells are turned into NaN.
    2. Text columns whose values all parse as numbers once ``$`` and ``,``
       are removed (e.g. ``"$1,000.00"``) are converted to float. Columns
       that do not parse keep their original text untouched, and a
       ``"<col> is string."`` line is printed for them.
    3. Missing values are imputed: the most frequent value for text columns,
       the mean for numeric columns.
    4. Two files are written next to the input: ``filepath + ".csv"`` (the
       cleaned rows, no header, strings quoted) and ``filepath + ".arff"``
       (ARFF header followed by those same rows).

    ``"?"`` cells are not modified.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read; also the prefix of the output files.

    Returns
    -------
    tuple
        ``(df, nominalvalues)`` where ``df`` is the cleaned DataFrame and
        ``nominalvalues`` maps each text column name to the list of its
        distinct values (as strings, in order of first appearance).
    """
    df = pd.read_csv(filepath)
    attributes = df.columns
    nominalvalues = {}

    def _is_text(series):
        # Text columns are ``object`` on older pandas and may be a dedicated
        # string dtype on newer versions; accept both.
        return (pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series))

    # np.nan (lowercase) works on every NumPy version; np.NaN was removed in 2.0.
    df = df.replace('N/A', np.nan)

    for col in df.columns:
        # Money-like columns: strip "$" and "," literally, then try to parse.
        if _is_text(df[col]):
            stripped = (df[col].str.replace("$", "", regex=False)
                               .str.replace(",", "", regex=False))
            try:
                df[col] = stripped.astype(float)
            except ValueError:
                # Genuine text column: leave the original values alone so that
                # commas / dollar signs inside names are not silently dropped.
                print("%r is string." % col)

        # Impute missing values.
        if df[col].isnull().any():
            if _is_text(df[col]):
                modes = df[col].mode()  # mode() already ignores NaN
                if not modes.empty:
                    df[col] = df[col].fillna(modes[0])
            else:
                mean_value = df[col].astype(float).mean()
                if pd.notna(mean_value):
                    df[col] = df[col].fillna(mean_value)

        # Remaining text columns become nominal ARFF attributes.
        if _is_text(df[col]):
            df[col] = df[col].astype(str)
            nominalvalues[col] = df[col].unique().tolist()

    # Write the cleaned rows first; the ARFF @data section is a copy of them.
    csv_path = filepath + ".csv"
    df.to_csv(csv_path, header=False, index=False, quoting=csv.QUOTE_NONNUMERIC)

    # generate arff file
    with open(filepath + ".arff", "w") as f:
        f.write("@relation %r\n\n" % filepath)
        for attr in attributes:
            arff_name = attr.replace(" ", "_")
            if attr in nominalvalues:
                nv = ",".join('"%s"' % value for value in nominalvalues[attr])
                f.write("@attribute " + arff_name + " {" + nv + "}\n")
            else:
                f.write("@attribute " + arff_name + " numeric\n")
        f.write("\n")
        f.write("@data\n")
        with open(csv_path) as nf:
            f.writelines(nf)

    return df, nominalvalues

In [72]:
# Clean the three salary tables in turn; each call yields the cleaned frame
# together with the distinct values of its text columns.
(dtpb, dtpb_nom), (sbct, sbct_nom), (sbr, sbr_nom) = [
    preprocess(csv_path)
    for csv_path in (
        "college-salaries/degrees-that-pay-back.csv",
        "college-salaries/salaries-by-college-type.csv",
        "college-salaries/salaries-by-region.csv",
    )
]

'Undergraduate Major' is string.
'School Name' is string.
'School Type' is string.
'School Name' is string.
'Region' is string.


# Exploration

In [76]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of dtpb.
dtpb.describe()

Unnamed: 0,Starting Median Salary,Mid-Career Median Salary,Percent change from Starting to Mid-Career Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,44310.0,74786.0,69.274,43408.0,55988.0,102138.0,142766.0
std,9360.866217,16088.40386,17.909908,12000.779567,13936.951911,20636.789914,27851.249267
min,34000.0,52000.0,23.4,26700.0,36500.0,70500.0,96400.0
25%,37050.0,60825.0,59.125,34825.0,44975.0,83275.0,124250.0
50%,40850.0,72000.0,67.8,39400.0,52450.0,99400.0,145500.0
75%,49875.0,88750.0,82.425,49850.0,63700.0,118750.0,161750.0
max,74300.0,107000.0,103.5,71900.0,87300.0,145000.0,210000.0


In [78]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of sbct.
sbct.describe()

Unnamed: 0,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
count,269.0,269.0,269.0,269.0,269.0,269.0
mean,46068.401487,83932.342007,44250.649351,60373.234201,116275.092937,157705.627706
std,6412.616242,14336.191107,8077.811281,11381.348857,22952.334054,32260.199283
min,34800.0,43900.0,22600.0,31800.0,60900.0,87600.0
25%,42000.0,74000.0,39600.0,53200.0,100000.0,138000.0
50%,44700.0,81600.0,44000.0,58400.0,113000.0,157705.627706
75%,48300.0,92200.0,46400.0,65100.0,126000.0,165000.0
max,75500.0,134000.0,80000.0,104000.0,234000.0,326000.0


In [79]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of sbr.
sbr.describe()

Unnamed: 0,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
count,320.0,320.0,320.0,320.0,320.0,320.0
mean,46253.4375,83934.375,45253.113553,60614.0625,116496.875,160442.124542
std,6617.038001,15191.443091,7906.907391,11786.436432,24104.265214,33967.918922
min,34500.0,43900.0,25600.0,31800.0,60900.0,85700.0
25%,42000.0,73725.0,40100.0,53100.0,99825.0,138000.0
50%,45100.0,82700.0,45253.113553,59400.0,113000.0,160000.0
75%,48900.0,93250.0,47925.0,66025.0,129000.0,171000.0
max,75500.0,134000.0,80000.0,104000.0,234000.0,326000.0


# sklearn

## degrees-that-pay-back.csv (`dtpb`)

In [74]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans, AffinityPropagation, AgglomerativeClustering, DBSCAN
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import fowlkes_mallows_score

In [None]:
# NOTE(review): this cell looks like leftover text-classification tutorial
# code. TfidfVectorizer, LinearSVC, docs_train and y_train are not imported
# or defined in the visible part of this notebook, so the cell would raise
# NameError on a fresh kernel unless they are defined elsewhere — confirm,
# then either supply them or remove the cell.
#
# TASK: Build a vectorizer / classifier pipeline that filters out tokens
# that are too rare or too frequent
pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', LinearSVC(C=1000)),
    ])
# TASK: Build a grid search to find out whether unigrams or bigrams are
# more useful.
# Fit the pipeline on the training set using grid search for the parameters
parameters = {
        # compare unigrams only, (1, 1), against unigrams + bigrams, (1, 2)
        'vect__ngram_range': [(1, 1), (1, 2)],
    }
# n_jobs=-1 is passed through to GridSearchCV unchanged
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train, y_train)


In [75]:
# Summary statistics for the numeric columns of dtpb (repeats the call made in the Exploration section).
dtpb.describe()

Unnamed: 0,Starting Median Salary,Mid-Career Median Salary,Percent change from Starting to Mid-Career Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,44310.0,74786.0,69.274,43408.0,55988.0,102138.0,142766.0
std,9360.866217,16088.40386,17.909908,12000.779567,13936.951911,20636.789914,27851.249267
min,34000.0,52000.0,23.4,26700.0,36500.0,70500.0,96400.0
25%,37050.0,60825.0,59.125,34825.0,44975.0,83275.0,124250.0
50%,40850.0,72000.0,67.8,39400.0,52450.0,99400.0,145500.0
75%,49875.0,88750.0,82.425,49850.0,63700.0,118750.0,161750.0
max,74300.0,107000.0,103.5,71900.0,87300.0,145000.0,210000.0
