In [1]:
import pandas as pd
import numpy as np
import spacy

from sklearn.cluster import KMeans

In [2]:
# Read the working dataset into `df`; the later cells all operate on this frame.
# An alternative input, "./data.csv", was previously loaded here instead.
csv_path = "./small_data.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,doc_title,doc_id,turn,speaker,text,speaker_bioguide,date
0,0,TRIBUTE TO IAN JACKSON,CREC-2019-02-26-pt1-PgS1487-5,0,Mr. RUBIO,"Mr. RUBIO. Madam President, today I honor Ian...",R000595,2019-02-26
1,1,REASSURING EUROPEAN ALLIES,CREC-2019-02-26-pt1-PgH2110-6,0,Mr. WILSON of South Carolina,"Mr. WILSON of South Carolina. Mr. Speaker, l...",W000795,2019-02-26
2,2,HONORING THE LIFE AND LEGACY OF ANNE MARIE LAB...,CREC-2019-02-26-pt1-PgE209-5,0,Mr. HASTINGS,"Mr. HASTINGS. Madam Speaker, I rise today to...",H000324,2019-02-26
3,3,RECOGNIZING THE 27TH ANNIVERSARY OF THE KHOJAL...,CREC-2019-02-26-pt1-PgE212-6,0,Mr. COHEN,"Mr. COHEN. Madam Speaker, this week marks th...",C001068,2019-02-26
4,4,ENSURE HEALTH PLANS COVER MEDICALLY NECESSARY ...,CREC-2019-02-26-pt1-PgH2111-2,0,Mr. RIGGLEMAN,"Mr. RIGGLEMAN. Mr. Speaker, I rise today to ...",R000611,2019-02-26


In [4]:
# Number of distinct values in the speaker_bioguide column, shown as a 1-tuple.
# NOTE(review): Series.unique() keeps NaN as a value; if the column has missing
# entries the figure includes one NaN — nunique() would count real ids only.
df['speaker_bioguide'].unique().shape

(314,)

In [5]:
# Non-null cell count per column; columns below the row total have missing values.
df.count()

Unnamed: 0          1659
doc_title           1659
doc_id              1659
turn                1659
speaker             1659
text                1659
speaker_bioguide    1109
date                1659
dtype: int64

In [6]:
def preprocess_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped, so the result is the
    whitespace-separated tokens of ``text`` joined by exactly one space.
    """
    tokens = text.split()
    return ' '.join(tokens)

In [7]:
def get_vector(doc):
    """Return the elements of ``doc.vector`` as a plain Python list."""
    return [component for component in doc.vector]

In [8]:
# Take the text of the row labelled 1 as a sample speech to experiment with.
first = df['text'][1]

In [9]:
# Speaker name recorded for the same sample row (label 1).
speaker = df['speaker'][1]

In [10]:
# Remove the first "<speaker>." occurrence from the sample speech, then
# normalise its whitespace with the shared preprocess_text helper.
processed = preprocess_text(first.replace(speaker + '.', '', 1))

In [11]:
# Load the spaCy pipeline registered under the 'en' shortcut name.
# NOTE(review): shortcut names such as 'en' are, as far as I know, no longer accepted
# by spaCy v3+, which expects a full package name (e.g. 'en_core_web_sm') — confirm
# against the installed spaCy version before re-running.
nlp = spacy.load('en')

In [12]:
# Run the spaCy pipeline on the single cleaned sample speech.
doc = nlp(processed)

In [13]:
# Step 1: normalise whitespace in every speech.
# Step 2: run each cleaned speech through the spaCy pipeline, one call per row.
# NOTE(review): unlike the single-sample cell, the leading "<speaker>." prefix is
# not removed here — confirm that is intended.
cleaned_text = df['text'].apply(preprocess_text)
df['doc'] = cleaned_text.apply(nlp)

In [67]:
# Extract each parsed document's vector as a Python list, one list per row.
df['doc_vector'] = df['doc'].map(get_vector)

In [73]:
# Stack the per-document vectors into one 2-D array (one row per document).
# A plain ndarray replaces np.matrix: NumPy no longer recommends the matrix
# class, and recent scikit-learn releases refuse np.matrix input to estimators.
mat = np.array(df.doc_vector.tolist())

In [74]:
# Check the dimensions of the stacked vectors: (number of rows, vector length).
mat.shape

(1659, 384)

In [92]:
# Cluster the document vectors into 30 groups. random_state is fixed so the
# initialisation is the same on every run. fit() returns the fitted estimator
# itself, which is what `kmeans` is bound to.
clusterer = KMeans(n_clusters=30, random_state=0)
kmeans = clusterer.fit(mat)

In [93]:
# Cluster index assigned by the fitted model to each row of `mat`.
kmeans.labels_

array([ 2, 22,  2, ..., 14, 17, 17], dtype=int32)

In [80]:
# Shape of the centroid array: (number of clusters, vector length).
# NOTE(review): the saved output below reports 10 centroids, and this cell's execution
# count (80) predates the n_clusters=30 fit (92) — the output looks stale; re-run it.
kmeans.cluster_centers_.shape

(10, 384)

In [91]:
# Score of the fitted model on `mat` (scikit-learn documents this as the opposite of
# the k-means objective, so values closer to zero mean tighter clusters).
# NOTE(review): this cell ran as In [91], before the current fit cell (In [92]), so the
# recorded value may belong to an earlier model — re-run after fitting.
kmeans.score(mat)

-490.3252

In [94]:
# Store each row's cluster index in a new 'label' column next to its text.
df['label'] = kmeans.labels_

In [141]:
# Topic categories used for labelling. The position of each entry matters:
# the keyword list is matched to this list by index.
categories = [
    'Abortion',
    'Budget & Economy & Jobs',
    'Civil Rights',
    'Corporations',
    'Crime & Drugs',
    'Education',
    'Energy & Oil & Environment',
    'Families & Children',
    'Foreign Policy & Military',
    'Free Trade',
    'Government Reform',
    'Gun Control',
    'Health Care',
    'Homeland Security & Immigration',
    'Social Security & Tax Reform',
    'Technology',
    'Welfare & Poverty',
    'Other',
]

In [142]:
# Display the category list to verify that it was split into separate entries.
categories

['Abortion',
 'Budget & Economy & Jobs',
 'Civil Rights',
 'Corporations',
 'Crime & Drugs',
 'Education',
 'Energy & Oil & Environment',
 'Families & Children',
 'Foreign Policy & Military',
 'Free Trade',
 'Government Reform',
 'Gun Control',
 'Health Care',
 'Homeland Security & Immigration',
 'Social Security & Tax Reform',
 'Technology',
 'Welfare & Poverty',
 'Other']

In [180]:
# Comma-separated trigger keywords, one string per topic.
# Order matters: entry i is paired with categories[i] when kw_to_cat is built,
# so this list must stay aligned (same length, same order) with `categories`.
# Keywords are written in lower case because matching lower-cases the text first.
keywords = [
    "abortion, pro-life, pro-choice, maternal right, sanctity of life, trimester, fetal, fetus",
    "budget, economy, jobs",
    "civil rights, gay, lgbt, lesbian, queer, transgender, bisexual, homosexual, racial, discrimination",
    # Slot for 'Corporations' — presumably a placeholder that is not expected to match any text.
    "------------------asdf-----------------",
    "drug, cocaine, marijuana, meth, weed",
    "education, school, university, college",
    "energy, oil, environment, natural gas, renewable, global warming, climate change",
    "domestic violence, child abuse",
    "pakistan, israel, palestine, russia, china, international, alliance, military, troops, nato",
    "nafta, free trade, trade war",
    "government reform",
    "gun control",
    "health care, affordable care act, health insurance, medicare, medicaid, health",
    "immigration, immigrant",
    "social security, tax reform",
    "technology, artificial intelligence, google, apple, facebook, amazon, microsoft, internet",
    "welfare, poverty, food stamp",
    # Slot for 'Other': tribute-style phrases.
    "to honor, i honor"
]

In [181]:
def _build_keyword_index(keyword_groups, category_names):
    """Map every individual keyword to the category at the same list position.

    Each entry of ``keyword_groups`` is a comma-separated string; its keywords
    are stripped of surrounding whitespace. A keyword that appears in several
    groups ends up mapped to the last group that lists it.
    """
    index = {}
    for position, group in enumerate(keyword_groups):
        category = category_names[position]
        for raw_keyword in group.split(","):
            index[raw_keyword.strip()] = category
    return index


kw_to_cat = _build_keyword_index(keywords, categories)

In [182]:
def keyword_categorize(text, mapping=None):
    """Return the set of categories whose keywords occur in ``text``.

    Matching is a plain substring test against the lower-cased text, so
    keywords must themselves be lower case. Being a substring test it also
    fires inside longer words (e.g. "meth" is found in "something").

    Args:
        text: The speech text to scan.
        mapping: Optional ``{keyword: category}`` dict. Defaults to the
            notebook-level ``kw_to_cat``.

    Returns:
        A set of category names; empty when no keyword occurs.
    """
    if mapping is None:
        mapping = kw_to_cat
    # Lower-case once up front instead of once per keyword.
    lowered = text.lower()
    return {cat for kw, cat in mapping.items() if kw in lowered}

In [183]:
def kc_count(categories):
    """Return how many categories a row was matched to.

    The parameter shadows the notebook-level ``categories`` list; inside this
    function the name refers only to the collection passed in.
    """
    n_matched = len(categories)
    return n_matched

In [184]:
def keyword_single_category(text, mapping=None):
    """Return the only category matched by ``text``, or ``None``.

    Uses the same lower-cased substring rule as ``keyword_categorize``. A
    category is returned only when exactly one distinct category matches;
    zero matches or several matches both yield ``None``.

    Args:
        text: The speech text to scan.
        mapping: Optional ``{keyword: category}`` dict. Defaults to the
            notebook-level ``kw_to_cat``.

    Returns:
        The single matching category name, or ``None``.
    """
    if mapping is None:
        mapping = kw_to_cat
    # Lower-case once up front instead of once per keyword.
    lowered = text.lower()
    matched = {cat for kw, cat in mapping.items() if kw in lowered}
    if len(matched) != 1:
        return None
    (only_category,) = matched
    return only_category

In [185]:
# For every speech, collect the set of categories whose keywords it contains.
df['cat_list'] = df['text'].map(keyword_categorize)

In [186]:
# Keep a category only for speeches that matched exactly one; others get None.
df['category'] = df['text'].map(keyword_single_category)

In [187]:
# Number of distinct categories matched per speech (size of its cat_list set).
df['cat_count'] = df['cat_list'].map(kc_count)

In [188]:
# How many rows currently have a category; count() ignores None/NaN entries.
df['category'].count()

356

In [189]:
# Rows matched by exactly one category, tallied per category.
# count() reports non-null cells per column, hence one figure for each column.
exactly_one = df['cat_count'] == 1
df[exactly_one].groupby('category').count()

Unnamed: 0_level_0,Unnamed: 0,doc_title,doc_id,turn,speaker,text,speaker_bioguide,date,doc,doc_vector,label,cat_list,cat_count
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Abortion,10,10,10,10,10,10,10,10,10,10,10,10,10
Budget & Economy & Jobs,12,12,12,12,12,12,12,12,12,12,12,12,12
Civil Rights,5,5,5,5,5,5,5,5,5,5,5,5,5
Crime & Drugs,37,37,37,37,37,37,37,37,37,37,37,37,37
Education,65,65,65,65,65,65,63,65,65,65,65,65,65
Energy & Oil & Environment,22,22,22,22,22,22,17,22,22,22,22,22,22
Families & Children,11,11,11,11,11,11,11,11,11,11,11,11,11
Foreign Policy & Military,148,148,148,148,148,148,61,148,148,148,148,148,148
Gun Control,3,3,3,3,3,3,3,3,3,3,3,3,3
Health Care,18,18,18,18,18,18,17,18,18,18,18,18,18


In [161]:
# Non-null counts for the rows that matched more than one category.
several_matches = df['cat_count'] > 1
df[several_matches].count()

Unnamed: 0          346
doc_title           346
doc_id              346
turn                346
speaker             346
text                346
speaker_bioguide    342
date                346
doc                 346
doc_vector          346
label               346
cat_list            346
cat_count           346
dtype: int64

In [None]:
# Non-null counts for the rows that matched no category at all.
no_match = df['cat_count'] == 0
df[no_match].count()

In [162]:
# Inspect the raw text of the row labelled 4.
df['text'][4]

"  Mr. RIGGLEMAN. Mr. Speaker, I rise today to speak for Kannon Koser, a \nconstituent and friend of mine who is with his family in the gallery \nthis afternoon.\n  Mr. Speaker, 3-year-old Kannon is one of 4 percent of children in the \nUnited States who is born with a congenital anomaly--in his case, \nhypohidrotic ectodermal dysplasia.\n  Most health plans provide coverage for congenital anomalies, and many \nStates require insurers to provide coverage of any health services \nrelated to congenital anomalies.\n  Despite this, health plans systematically and routinely deny claims \nand appeals for any oral or dental related procedures under the \npretense that such service is merely cosmetic.\n  That is why Representative Collin Peterson and I have joined together \nto introduce the Ensuring Lasting Smiles Act, a bipartisan bill that \nwould directly address this issue and ensure that we don't have to tell \nchildren like Kannon that their health condition isn't valued by \ninsurance 

In [193]:
from IPython.display import clear_output

# Manual disambiguation of rows that matched more than one category: show the
# text, list the candidate categories, and record the reviewer's pick.
# NOTE(review): `labels` is a list in row order here, and a later cell rebinds
# `labels` to a dict — use or save these picks before running that cell.
labels = []

for row in df[df['cat_count'] > 1][['text', 'cat_list']].itertuples():
    text = row.text
    # cat_list is a set (unordered); sort it so the menu is stable between runs.
    cats = sorted(row.cat_list)
    print(text)
    prompt = '\n'.join('{}) {}'.format(number, cat) for number, cat in enumerate(cats, start=1)) + '\n'
    # Keep asking until a listed number is entered. Without this check, "0" would
    # silently select the last option (index -1) and a typo would abort the session.
    while True:
        try:
            choice = int(input(prompt))
        except ValueError:
            choice = 0
        if 1 <= choice <= len(cats):
            break
        print('Please enter a number between 1 and {}.'.format(len(cats)))
    labels.append(cats[choice - 1])
    clear_output()
    

  Mr. HASTINGS. Madam Speaker, I rise today to honor the life and 
legacy of Anne Marie Labelle. Anne was a wonderful mother, sister, 
grandmother, great grandmother and aunt. She was a selfless community 
advocate and will always be remembered for her strength and love of 
family.
  Anne was born in Troy, New York before moving to Broward County, 
Florida as a teenager. After high school, Anne became known for her 
activism in Sunrise city politics, before retiring far too young after 
suffering a stroke in 1982. In typical Anne fashion, she fought hard in 
rehab and recovered to work for local real estate developers in 
Plantation and Boca Raton. Anne was appreciative of those who helped 
her in her time of need, so she started volunteering at the Sunrise 
Rehabilitation Center and became a popular one-on-one volunteer for 
victims of stroke rehabilitation and brain injuries. She served on the 
hospital's community advisory board, while also volunteering with 
Shake-a-Leg, a group fo

KeyboardInterrupt: 

In [177]:
# Print every multi-category row (index, text, matched categories) for inspection.
multi_category = df.loc[df['cat_count'] > 1, ['text', 'cat_list']]
for record in multi_category.itertuples():
    print(record)

Pandas(Index=0, text=" Mr. RUBIO. Madam President, today I honor Ian Jackson, the \nVolusia County Teacher of the Year from T. Dewitt Taylor Middle-High \nSchool in Pierson, FL.\n  Ian is an Advancement Via Individual Determination teacher, working\n\n\nwith students from 8th to 12th grade and considers it his job to change \nthe trajectories of his students for the better. After receiving this \naward, Ian noted that it was not just him being recognized, but also \nhis students for their success.\n  Ian urges his students to strive for greatness in their middle school \nand high school coursework in preparation for the college workload. He \nfocuses on ensuring his classroom feels like a second home to his \nstudents when they struggle and are in need of support.\n  Many of Ian's students come from difficult circumstances, so he works \nto establish strong relationships and create a positive environment for \nthem. He dedicates his time to listening to the needs of his students \nand 

In [194]:
# Billy use this to label
from IPython.display import clear_output

# Manual labelling of rows that matched no keyword at all.
# `labels` maps the DataFrame index of each row to the category chosen for it.
labels = {}

# The menu lists every category and is identical for each row, so build it once.
menu = '\n'.join('{}) {}'.format(number, name) for number, name in enumerate(categories, start=1)) + '\n'

for row in df.loc[df['cat_count'] == 0, ['text']].itertuples():
    print(row.text)
    # Keep asking until a listed number is entered. Without this check, "0" would
    # silently select the last category (index -1) and a typo would abort the session.
    while True:
        try:
            choice = int(input(menu))
        except ValueError:
            choice = 0
        if 1 <= choice <= len(categories):
            break
        print('Please enter a number between 1 and {}.'.format(len(categories)))
    labels[row.Index] = categories[choice - 1]
    clear_output()

  The PRESIDING OFFICER. Pursuant to rule XXII, the Chair lays before 
the Senate the pending cloture motion, which the clerk will state.


KeyboardInterrupt: 

In [None]:
import pickle as pkl

# Persist the manual labels so a labelling session is not lost with the kernel.
# pickle needs a writable binary handle: the default open() mode is read-only text,
# which cannot be written to. The with-block also closes the file once it is written.
with open('./labels.pkl', 'wb') as labels_file:
    pkl.dump(labels, labels_file)

In [208]:
# Run this after
for idx, label in labels.items():
    df.loc[idx, 'category'] = label

In [None]:
# save to new csv
# index=False: the row index is not written as an extra unnamed column. The loaded
# frame already carries an 'Unnamed: 0' column (presumably from an earlier save that
# included the index), and writing the index again would add another on the next load.
df.to_csv('./new_data.csv', index=False)