### Test Out FuzzyPandas Functionality

In [1]:
%load_ext autoreload
%autoreload 2

import pyprind
import pandas as pd
import fuzzypandas as fp

#### Read In U.S. News Data

In [2]:
def read_us_news(pickle, frame=None):
    """Load one pickled U.S. News table and append it to an accumulator frame.

    Parameters
    ----------
    pickle : str
        Path to a pickled DataFrame; handed straight to ``pd.read_pickle``.
    frame : pandas.DataFrame, optional
        Frame the loaded rows are appended to. When omitted, the notebook-level
        ``us_news`` global is used, which is what existing callers rely on.

    Returns
    -------
    pandas.DataFrame
        ``frame`` followed by the loaded rows, re-indexed from 0.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load trusted files.
    if frame is None:
        frame = us_news  # global fallback kept for backward compatibility
    loaded = pd.read_pickle(pickle)
    return pd.concat([frame, loaded], axis=0, ignore_index=True)

# Stack every U.S. News ranking table into one frame, one pickle at a time.
us_news = pd.DataFrame()  # empty accumulator that read_us_news appends to
for ranking_pickle in (
    "testdata/us_news/usnews-ranking-national-universities.pickle",
    "testdata/us_news/usnews-ranking-national-liberal-arts-colleges.pickle",
    "testdata/us_news/usnews-ranking-regional-colleges-midwest.pickle",
    "testdata/us_news/usnews-ranking-regional-colleges-north.pickle",
    "testdata/us_news/usnews-ranking-regional-colleges-south.pickle",
    "testdata/us_news/usnews-ranking-regional-colleges-west.pickle",
    "testdata/us_news/usnews-ranking-regional-universities-midwest.pickle",
    "testdata/us_news/usnews-ranking-regional-universities-north.pickle",
    "testdata/us_news/usnews-ranking-regional-universities-south.pickle",
    "testdata/us_news/usnews-ranking-regional-universities-west.pickle",
):
    us_news = read_us_news(ranking_pickle)


def _to_ascii_upper(text):
    """Upper-case, strip and ASCII-encode a value; unencodable characters are replaced."""
    return text.upper().strip().encode("ascii", "replace")


# Clean the text keys. The first ", "-separated piece of `location` becomes the city
# and the second becomes the state.
us_news["city"] = us_news.location.apply(lambda loc: _to_ascii_upper(loc.split(", ")[0]))
us_news["state"] = us_news.location.apply(lambda loc: _to_ascii_upper(loc.split(", ")[1]))
us_news["school"] = us_news.school.apply(_to_ascii_upper)

# The original location column is redundant once city and state exist.
us_news.drop("location", axis=1, inplace=True)
us_news.head(5)

Unnamed: 0,category,school,score,city,state
0,National Universities,PRINCETON UNIVERSITY,100 out of 100.,PRINCETON,NJ
1,National Universities,HARVARD UNIVERSITY,99 out of 100.,CAMBRIDGE,MA
2,National Universities,YALE UNIVERSITY,97 out of 100.,NEW HAVEN,CT
3,National Universities,COLUMBIA UNIVERSITY,95 out of 100.,NEW YORK,NY
4,National Universities,STANFORD UNIVERSITY,95 out of 100.,STANFORD,CA


In [4]:
# NOTE(review): this cell fails -- its recorded output is
# "AttributeError: 'DataFrame' object has no attribute 'name'". Looks like leftover debugging.
print us_news.name

AttributeError: 'DataFrame' object has no attribute 'name'

#### Read in IPEDS Data (IPEDS can be downloaded for free [here](https://nces.ed.gov/ipeds/datacenter/Default.aspx))

In [3]:
# Load the IPEDS file, keeping only the ID, name, city and state columns.
ipeds = pd.read_csv("testdata/ipeds/HD2014.csv")[["UNITID", "INSTNM", "CITY", "STABBR"]]

# Switch to the lower-case column names used on the U.S. News side.
ipeds.rename(
    columns={
        "UNITID": "unitid",
        "INSTNM": "school",
        "CITY": "city",
        "STABBR": "state",
    },
    inplace=True,
)

# Upper-case the key columns and trim surrounding whitespace.
for key_column in ("school", "city", "state"):
    ipeds[key_column] = ipeds[key_column].apply(lambda value: value.upper().strip())

print(ipeds.shape)
ipeds.head(5)

(7687, 4)


Unnamed: 0,unitid,school,city,state
0,100636,COMMUNITY COLLEGE OF THE AIR FORCE,MONTGOMERY,AL
1,100654,ALABAMA A & M UNIVERSITY,NORMAL,AL
2,100663,UNIVERSITY OF ALABAMA AT BIRMINGHAM,BIRMINGHAM,AL
3,100690,AMRIDGE UNIVERSITY,MONTGOMERY,AL
4,100706,UNIVERSITY OF ALABAMA IN HUNTSVILLE,HUNTSVILLE,AL


In [4]:
# Show the Python type of the first school name in each frame.
for frame in (us_news, ipeds):
    print(type(frame.school.tolist()[0]))

<type 'str'>
<type 'str'>


#### Subset to only NY & CT schools

In [5]:
# Keep only the New York and Connecticut rows of each frame (as independent copies).
us_news = us_news.loc[us_news["state"].isin(["NY", "CT"])].copy()
ipeds = ipeds.loc[ipeds["state"].isin(["NY", "CT"])].copy()

print(us_news.shape)
print(ipeds.shape)

(142, 5)
(574, 4)


In [6]:
# Match the distinct U.S. News school names against the distinct IPEDS school names.
# NOTE(review): cutoff=60 is presumably the minimum match_score to keep -- confirm in fuzzypandas.
test = fp.basic_match(us_news.school.unique(), ipeds.school.unique(), cutoff=60, quickmatch=True)
test.head(5)

Unnamed: 0,keyA,keyB,match_score
0,YALE UNIVERSITY,YALE UNIVERSITY,100
1,COLUMBIA UNIVERSITY,COLGATE UNIVERSITY,81
2,CORNELL UNIVERSITY,CORNELL UNIVERSITY,100
3,NEW YORK UNIVERSITY,NEW YORK UNIVERSITY,100
4,UNIVERSITY OF ROCHESTER,UNIVERSITY OF ROCHESTER,100


In [11]:
# Two-column working copies: the school name plus one extra field from each source.
testA = us_news.loc[:, ["school", "category"]].copy()
testB = ipeds.loc[:, ["school", "unitid"]].copy()

In [8]:
# Preview the first two rows of the U.S. News school/category copy.
testA.head(2)

Unnamed: 0,school,category
2,YALE UNIVERSITY,National Universities
3,COLUMBIA UNIVERSITY,National Universities


In [9]:
# Preview the first two rows of the IPEDS school/unitid copy.
testB.head(2)

Unnamed: 0,school,unitid
668,ALBERTUS MAGNUS COLLEGE,128498
669,PAUL MITCHELL THE SCHOOL-DANBURY,128540


In [23]:
# Fuzzy-merge the two frames using state, city and school together as the key.
# NOTE(review): this rebinds `test`, which previously held the fp.basic_match result.
test = fp.fuzzy_merge(us_news, ipeds, on=["state","city","school"])

In [24]:
# Preview the first ten rows of the fuzzy-merge result.
test.head(10)

Unnamed: 0,category_x,school_x,score_x,city_x,state_x,in_a_x,unitid_x,in_b_x,keyA,keyB,match_score,category_y,school_y,score_y,city_y,state_y,in_a_y,unitid_y,in_b_y
0,National Universities,COLUMBIA UNIVERSITY,95 out of 100.,NEW YORK,NY,1,,,NY NEW YORK COLUMBIA UNIVERSITY,NY NEW YORK ROCKEFELLER UNIVERSITY,77,,ROCKEFELLER UNIVERSITY,,NEW YORK,NY,,195049,1
1,National Universities,FORDHAM UNIVERSITY,54 out of 100.,NEW YORK,NY,1,,,NY NEW YORK FORDHAM UNIVERSITY,NY BRONX FORDHAM UNIVERSITY,81,,FORDHAM UNIVERSITY,,BRONX,NY,,191241,1
2,National Universities,BINGHAMTON UNIVERSITY??SUNY,46 out of 100.,BINGHAMTON,NY,1,,,NY BINGHAMTON BINGHAMTON UNIVERSITY??SUNY,NY BRONX FORDHAM UNIVERSITY,62,,FORDHAM UNIVERSITY,,BRONX,NY,,191241,1
3,National Universities,STONY BROOK UNIVERSITY??SUNY,46 out of 100.,STONY BROOK,NY,1,,,NY STONY BROOK STONY BROOK UNIVERSITY??SUNY,NY STONY BROOK STONY BROOK UNIVERSITY,93,,STONY BROOK UNIVERSITY,,STONY BROOK,NY,,196097,1
4,National Universities,UNIVERSITY AT BUFFALO??SUNY,44 out of 100.,BUFFALO,NY,1,,,NY BUFFALO UNIVERSITY AT BUFFALO??SUNY,NY BUFFALO UNIVERSITY AT BUFFALO,91,,UNIVERSITY AT BUFFALO,,BUFFALO,NY,,196088,1
5,National Universities,NEW SCHOOL,37 out of 100.,NEW YORK,NY,1,,,NY NEW YORK NEW SCHOOL,NY NEW YORK THE NEW SCHOOL,92,,THE NEW SCHOOL,,NEW YORK,NY,,193654,1
6,National Universities,UNIVERSITY AT ALBANY??SUNY,36 out of 100.,ALBANY,NY,1,,,NY ALBANY UNIVERSITY AT ALBANY??SUNY,NY ALBANY SUNY AT ALBANY,77,,SUNY AT ALBANY,,ALBANY,NY,,196060,1
7,National Universities,ST. JOHN FISHER COLLEGE,33 out of 100.,ROCHESTER,NY,1,,,NY ROCHESTER ST. JOHN FISHER COLLEGE,NY ROCHESTER SAINT JOHN FISHER COLLEGE,95,,SAINT JOHN FISHER COLLEGE,,ROCHESTER,NY,,195720,1
8,National Universities,ST. JOHN'S UNIVERSITY,31 out of 100.,QUEENS,NY,1,,,NY QUEENS ST. JOHN'S UNIVERSITY,NY QUEENS ST JOHN'S UNIVERSITY-NEW YORK,86,,ST JOHN'S UNIVERSITY-NEW YORK,,QUEENS,NY,,195809,1
9,National Universities,PACE UNIVERSITY,25 out of 100.,NEW YORK,NY,1,,,NY NEW YORK PACE UNIVERSITY,NY NEW YORK PACE UNIVERSITY-NEW YORK,86,,PACE UNIVERSITY-NEW YORK,,NEW YORK,NY,,194310,1


In [None]:
# NOTE(review): `xwalk` is not assigned in any cell shown in this notebook -- presumably
# left over from an earlier session; define it or drop this cell so Restart & Run All works.
xwalk.head(25)

In [None]:
from fuzzywuzzy import fuzz, process

def basic_match(a, b, keyA, keyB, cutoff, quickmatch):
    """Placeholder matcher: return the distinct values found in ``b[keyB]``.

    Only ``b`` and ``keyB`` are used. ``a``, ``keyA``, ``cutoff`` and
    ``quickmatch`` are accepted but currently ignored, and no fuzzy matching
    is performed here.

    Returns
    -------
    The result of ``b[keyB].unique()``.
    """
    return b[keyB].unique()

In [None]:
# Calls the basic_match function defined in this notebook (not fp.basic_match). It returns
# the distinct ipeds school names, and `test` is rebound to that result.
test = basic_match(us_news, ipeds, keyA="school", keyB="school", cutoff=60, quickmatch=True)

In [None]:
# Show the first ten entries of `test`.
print(test[:10])

In [None]:
# Look up "YALE UNIVERSITY" among the values in `test`, scoring with fuzz.ratio.
match = process.extractOne("YALE UNIVERSITY", test, scorer=fuzz.ratio, score_cutoff=60)
print(match)


In [None]:
# `fuzz` is imported together with `process` so this cell does not depend on an earlier
# cell having already brought `fuzz` into the kernel namespace.
from fuzzywuzzy import fuzz, process

match = process.extractOne("YALE", ["YALE","HARVARD","PRINCETON"], scorer=fuzz.ratio, score_cutoff=60)
print(match)

In [None]:
# NOTE(review): `nomatch_a` is not assigned in any cell shown in this notebook -- presumably
# left over from an earlier session; define it or drop this cell.
print nomatch_a.shape

In [None]:
from fuzzywuzzy import fuzz, process
value = "Leroy"
possible = ["L", "2", "3"]

# score_cutoff=99 is much stricter than the 60 used in the earlier cells.
# NOTE(review): presumably none of these candidates can reach that score, so this probes what
# extractOne returns when nothing qualifies -- confirm against the fuzzywuzzy docs.
match = process.extractOne(value, possible, score_cutoff=99)

In [None]:
# Show the result of the previous extractOne call.
print(match)

In [None]:
# Check how to detect a missing match. Identity comparison (`is None`) is the
# reliable test for None, because `==` can be overridden by the compared object.
match = "Not None"

print(match is None)

In [None]:
# Check how to detect a list. isinstance() also accepts list subclasses, which an
# exact `type(...) == list` comparison would reject.
l = 5

print(isinstance(l, list))

#### Fix This: `us_news` appears to gain an extra `in_a` column (see the `us_news.info()` output below)

In [None]:
# Preview the first five rows of us_news.
us_news.head(5)

In [None]:
# Adds a "test" column to us_news holding the same values as "state" (modifies us_news in place).
us_news["test"] = us_news["state"]

In [None]:
# Displays the whole us_news frame (not just a head() preview).
us_news

In [26]:
# DataFrame.info() writes its summary itself and returns None, so wrapping it in print
# only appended a stray "None" line to the output.
us_news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142 entries, 2 to 1236
Data columns (total 6 columns):
category    142 non-null object
school      142 non-null object
score       142 non-null object
city        142 non-null object
state       142 non-null object
in_a        142 non-null int64
dtypes: int64(1), object(5)
memory usage: 7.8+ KB
None
