####  Test Out FuzzyPandas Functionality

In [36]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import fuzzypandas as fp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Read In U.S. News Data

In [19]:
def stack_us_news(dir):
    '''
    Stack the individual scraped US News files into a single dataframe.

    Parameters
    ----------
    dir : str
        Directory to scan. Every entry whose name contains '.pickle' is read
        with ``pd.read_pickle``. (The parameter name shadows the ``dir``
        builtin; it is kept so existing keyword callers keep working.)

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated row-wise (original chunk indexes are kept).
        The ``location`` column is replaced by ``city`` and ``state`` (the
        first and second pieces of ``location`` split on ", "), and
        ``city``, ``state`` and ``school`` are upper-cased and stripped.
        A location without a ", " separator yields a missing ``state``.

    Raises
    ------
    ValueError
        If ``dir`` contains no '.pickle' file.
    '''
    # Sort so the row order does not depend on os.listdir's arbitrary order
    files = sorted(name for name in os.listdir(dir) if '.pickle' in name)
    if not files:
        raise ValueError('No .pickle files found in {dir}'.format(dir=dir))

    # Read every chunk first and concatenate once: growing the frame inside
    # the loop re-copies all previously loaded rows on each iteration.
    chunks = []
    for file in files:
        print('Importing: {file}'.format(file=file))
        # NOTE: unpickling can execute arbitrary code -- only point this at
        # trusted files.
        chunks.append(pd.read_pickle(os.path.join(dir, file)))
    df = pd.concat(chunks, axis=0)

    # Separate out city and state from location
    parts = df["location"].str.split(", ")
    df["city"] = parts.str[0].str.upper().str.strip()
    df["state"] = parts.str[1].str.upper().str.strip()
    df["school"] = df["school"].str.upper().str.strip()

    del df['location']
    return df

In [4]:
# Load the stacked U.S. News data from its cached pickle; rebuild the cache
# from the individual scraped files only when it is missing.
usnews_cache = 'testdata/us_news/usnews.pkl'
if os.path.exists(usnews_cache):
    usnews = pd.read_pickle(usnews_cache)
else:
    usnews = stack_us_news('testdata/us_news/')
    usnews.to_pickle(usnews_cache)

print(usnews.shape)
usnews.head()

(1506, 5)


Unnamed: 0,category,school,score,city,state
0,Regional Colleges North,UNITED STATES COAST GUARD ACADEMY,100 out of 100.,NEW LONDON,CT
1,Regional Colleges North,COOPER UNION,94 out of 100.,NEW YORK,NY
2,Regional Colleges North,UNITED STATES MERCHANT MARINE ACADEMY,74 out of 100.,KINGS POINT,NY
3,Regional Colleges North,ELIZABETHTOWN COLLEGE,67 out of 100.,ELIZABETHTOWN,PA
4,Regional Colleges North,MESSIAH COLLEGE,65 out of 100.,MECHANICSBURG,PA


#### Read In IPEDS Data (IPEDS can be downloaded for free [here](https://nces.ed.gov/ipeds/datacenter/Default.aspx))

In [5]:
# Read the IPEDS file, keep the four identifying columns and rename them to
# the lower-case names used as match keys. Building the frame in one chain
# (no inplace rename on a slice) avoids SettingWithCopyWarning.
ipeds = (
    pd.read_csv("testdata/ipeds/HD2014.csv", encoding='latin1')
    .loc[:, ["UNITID", "INSTNM", "CITY", "STABBR"]]
    .rename(columns={"UNITID": "unitid",
                     "INSTNM": "school",
                     "CITY": "city",
                     "STABBR": "state"})
)

# Convert keys to uppercase & strip blankspace
for key in ["school", "city", "state"]:
    ipeds[key] = ipeds[key].str.upper().str.strip()

print(ipeds.shape)
ipeds.head()

(7687, 4)


Unnamed: 0,unitid,school,city,state
0,100636,COMMUNITY COLLEGE OF THE AIR FORCE,MONTGOMERY,AL
1,100654,ALABAMA A & M UNIVERSITY,NORMAL,AL
2,100663,UNIVERSITY OF ALABAMA AT BIRMINGHAM,BIRMINGHAM,AL
3,100690,AMRIDGE UNIVERSITY,MONTGOMERY,AL
4,100706,UNIVERSITY OF ALABAMA IN HUNTSVILLE,HUNTSVILLE,AL


#### TEST: Subset to only CT schools

In [8]:
# Restrict both datasets to rows whose state is 'CT'; .copy() detaches the
# subsets from the full frames.
usnews = usnews.loc[usnews["state"] == 'CT'].copy()
ipeds = ipeds.loc[ipeds["state"] == 'CT'].copy()

print(usnews.shape)
print(ipeds.shape)

(21, 5)
(97, 4)


In [37]:
# Score usnews rows against ipeds rows on the three shared key columns
pairs = fp.matcher(
    usnews,
    ipeds,
    on=['state', 'city', 'school'],
    score_cutoff=60
)

print(pairs.shape)
pairs.head()

2018-08-01 00:00:21,807 - fuzzypandas - INFO - Starting Pair Scoring for 6111 combinations.
2018-08-01 00:00:22,079 - fuzzypandas - INFO - Completed pair matching.


(21, 10)


Unnamed: 0,avg_match_score,city,city_match_score,city_matched,school,school_match_score,school_matched,state,state_match_score,state_matched
1268,100.0,BRIDGEPORT,100,BRIDGEPORT,UNIVERSITY OF BRIDGEPORT,100,UNIVERSITY OF BRIDGEPORT,CT,100,CT
1414,100.0,DANBURY,100,DANBURY,WESTERN CONNECTICUT STATE UNIVERSITY,100,WESTERN CONNECTICUT STATE UNIVERSITY,CT,100,CT
309,100.0,FAIRFIELD,100,FAIRFIELD,FAIRFIELD UNIVERSITY,100,FAIRFIELD UNIVERSITY,CT,100,CT
531,100.0,FAIRFIELD,100,FAIRFIELD,SACRED HEART UNIVERSITY,100,SACRED HEART UNIVERSITY,CT,100,CT
433,100.0,HAMDEN,100,HAMDEN,QUINNIPIAC UNIVERSITY,100,QUINNIPIAC UNIVERSITY,CT,100,CT


In [22]:
[True] * 3 + [False]

[True, True, True, False]

In [None]:
# The U.S. News frame is named `usnews` in this notebook (`us_news` is never
# defined and raises NameError on a fresh kernel).
testA = usnews[["school", "category"]].copy()
testB = ipeds[["school", "unitid"]].copy()

In [None]:
testA.head(2)

In [None]:
testB.head(2)

In [None]:
# `usnews` is the frame defined above; `us_news` does not exist in this notebook
test = fp.fuzzy_merge(usnews, ipeds, on=["state", "city", "school"])

In [None]:
test.head(10)

In [None]:
# NOTE(review): `xwalk` is not defined anywhere in the visible notebook, so
# this cell will raise NameError on a fresh kernel -- confirm where it should
# come from or remove the cell.
xwalk.head(25)

In [None]:
from fuzzywuzzy import fuzz, process

def basic_match(a, b, keyA, keyB, cutoff, quickmatch):
    '''
    Return the distinct values of ``b[keyB]``.

    This is a stub: ``a``, ``keyA``, ``cutoff`` and ``quickmatch`` are
    accepted but currently unused.

    Parameters
    ----------
    a : unused
    b : object supporting ``b[keyB].unique()`` (e.g. a pandas DataFrame)
    keyA : unused
    keyB : key/column of ``b`` whose unique values are returned
    cutoff : unused
    quickmatch : unused

    Returns
    -------
    The result of ``b[keyB].unique()``.
    '''
    return b[keyB].unique()

In [None]:
# `usnews` is the frame defined above; `us_news` does not exist in this notebook
test = basic_match(usnews, ipeds, keyA="school", keyB="school", cutoff=60, quickmatch=True)

In [None]:
# print() call form works on both Python 2 and Python 3
print(test[:10])

In [None]:
match = process.extractOne("YALE UNIVERSITY", test, scorer=fuzz.ratio, score_cutoff=60)
# print() call form works on both Python 2 and Python 3
print(match)


In [None]:
from fuzzywuzzy import process

match = process.extractOne("YALE", ["YALE", "HARVARD", "PRINCETON"], scorer=fuzz.ratio, score_cutoff=60)
# print() call form works on both Python 2 and Python 3
print(match)

In [None]:
# NOTE(review): `nomatch_a` is not defined in the visible notebook -- confirm
# where it should come from. print() call form works on Python 2 and 3.
print(nomatch_a.shape)

In [None]:
from fuzzywuzzy import fuzz, process
value = "Leroy"
possible = ["L", "2", "3"]

match = process.extractOne(value, possible, score_cutoff=99)

In [None]:
# print() call form works on both Python 2 and Python 3
print(match)

In [None]:
match = "Not None"

# `is None` is the idiomatic None test; printing the boolean directly gives
# the same True/False output as the if/else did.
print(match is None)

In [None]:
l = 5

# isinstance is the idiomatic type check (it also accepts list subclasses);
# printing the boolean directly gives the same True/False output.
print(isinstance(l, list))

#### Fix This

In [None]:
# `usnews` is the frame defined above; `us_news` does not exist in this notebook
usnews.head(5)

In [None]:
# `usnews` is the frame defined above; `us_news` does not exist in this notebook
usnews["test"] = usnews["state"]

In [None]:
# Show a preview rather than dumping every row; `usnews` is the frame
# defined above (`us_news` does not exist in this notebook).
usnews.head()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly instead of being passed to print. `usnews` is the frame
# defined above (`us_news` does not exist in this notebook).
usnews.info()