# Convote Dataset - Basic Preprocessing
- Data Main Page: http://www.cs.cornell.edu/home/llee/data/convote.html 
- About the Data: http://www.cs.cornell.edu/home/llee/data/convote/README.v1.1.txt


In [None]:
import sys
import pandas as pd
import os
import string
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, sent_tokenize
from nltk.probability import FreqDist
import timeit
from sklearn import metrics
from sklearn.metrics import plot_precision_recall_curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


# Access to the fastText transform
sys.path.insert(0, '/Users/jilljenkins/Desktop/NLP/Project/word_embedding')
from data import transform_data_fasttext

import warnings
warnings.filterwarnings('ignore')

In [None]:
# For now, we will use the data from stage one
train_path = ('./convote_v1.1/data_stage_one/training_set/')
test_path = ('./convote_v1.1/data_stage_one/test_set/')
train_file_names = os.listdir(train_path)
test_file_names = os.listdir(test_path)


def _read_speeches(directory, file_names):
    """Read the listed files from `directory` into a DataFrame.

    Parameters
    ----------
    directory : str
        Directory that contains the speech files.
    file_names : list of str
        File names (relative to `directory`) to read, in the order given.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns 'File' (the file name) and 'Text'
        (the full file contents). Both columns exist even when
        `file_names` is empty.
    """
    records = []
    for file_name in file_names:
        # Explicit encoding so the result does not depend on the platform's
        # locale default.
        with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as target_file:
            records.append((file_name, target_file.read()))
    # The file name is kept as a regular column; index labels are converted
    # to strings, as in the earlier from_dict/reset_index/rename pipeline.
    return pd.DataFrame(records, columns=['File', 'Text']).rename(index=str)


# The data is partitioned into one directory per split, so each split is read
# separately with the same helper.
train_data = _read_speeches(train_path, train_file_names)
test_data = _read_speeches(test_path, test_file_names)

In [None]:
# Drop every '.txt' occurrence from the file names of both splits
for frame in (train_data, test_data):
    frame['File'] = frame['File'].str.replace('.txt', '', regex=False)

In [None]:
# Add Label feature: the last three characters of each training file name
Label = [file_name[-3:] for file_name in train_data.File]

train_data['Label'] = Label

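### 'Label' Feature Details

The three-character label taken from the end of each file name encodes three indicators, described in the dataset README as follows:
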
- 'P' is replaced by a party indicator, D or R (or X if no
   corresponding party could be found).  As mentioned in the paper, we 
   purposely *did not* use this information in our experiments.

- 'M' is replaced by an indicator of whether the bill under
   discussion is mentioned directly in the speech segment, or whether it is
   only referenced by another speech segment on the same page.  If the bill is
   directly mentioned in the current speech, the letter M appears in
   the file name; otherwise, the letter O appears.

- 'V' is replaced by a vote indicator, Y or N, which serves as the
   ground-truth label for the speech.

In [None]:
# Split Label into three single-character columns.
# Position 0 is the party, position 1 the bill-mention indicator and
# position 2 the vote. The .str accessor indexes each label directly, so no
# per-row Series has to be built to pull out one character.
train_data['Party'] = train_data['Label'].str[0]
train_data['Discussion'] = train_data['Label'].str[1]
train_data['Vote'] = train_data['Label'].str[2]

In [None]:
# Preview the first rows of the training frame, including the label columns.
train_data.head()

Unnamed: 0,File,Text,Label,Party,Discussion,Vote
0,282_400436_1413023_DMN,"mr. speaker , i would like to say a word about...",DMN,D,M,N
1,088_400272_2994052_DON,"mr. speaker , today we have some very clear ch...",DON,D,O,N
2,038_400080_0251064_DON,"mr. speaker , i yield myself such time as i ma...",DON,D,O,N
3,132_400227_0763073_DON,"mr. chairman , i yield back the balance of my ...",DON,D,O,N
4,282_400380_1838049_ROY,"madam chairman , will the gentleman yield ? \n",ROY,R,O,Y


In [None]:
# Original note: the test data has some Stage 2 files (reason unknown).
# Trailing digits, dots, hyphens and spaces are stripped from each file name
# before its last three characters are taken as the label.
Label = [file_name.rstrip('0123456789.- ')[-3:] for file_name in test_data.File]

test_data['Label'] = Label

In [None]:
# Split label into distinct single-character columns.
# Position 0 is the party, position 1 the bill-mention indicator and
# position 2 the vote. The .str accessor indexes each label directly, so no
# per-row Series has to be built to pull out one character.
test_data['Party'] = test_data['Label'].str[0]
test_data['Discussion'] = test_data['Label'].str[1]
test_data['Vote'] = test_data['Label'].str[2]

In [None]:
# Preview the first rows of the test frame, including the label columns.
test_data.head()

Unnamed: 0,File,Text,Label,Party,Discussion,Vote
0,414_400080_3170075_DON,"madam speaker , i yield myself 35 seconds . \n...",DON,D,O,N
1,414_400061_1909178_ROY,"mr. chairman , i demand a recorded vote . \n",ROY,R,O,Y
2,102_400175_0641038_ROY,"mr. speaker , i rise today as a cosponsor of h...",ROY,R,O,Y
3,414_400080_3170065_DON,"madam speaker , i yield myself 15 seconds . \n...",DON,D,O,N
4,472_400314_2238033_DON,"mr. speaker , i thank the ranking member , the...",DON,D,O,N


### Party Speech Distribution

In [None]:
# Report how many speeches each party contributes to each split
for heading, frame in (('Training Party Distribution:', train_data),
                       ('Test Party Distribution:', test_data)):
    party_counts = frame['Party'].value_counts()
    print(heading, '\n', party_counts, '\n')

Training Party Distribution: 
 D    2848
R    2786
I      26
Name: Party, dtype: int64 

Test Party Distribution: 
 R    891
D    863
I      5
Name: Party, dtype: int64 



### Sentence Count & Length

In [None]:
# Add column for sentence count: the number of newline-separated lines that
# remain after surrounding whitespace is stripped from the text
for frame in (train_data, test_data):
    stripped_text = frame['Text'].str.strip()
    frame['NumSents'] = stripped_text.str.split('\n').str.len()

In [None]:
# Inspect the sentence counts. A notebook cell renders only its last bare
# expression, so only the test_data series is displayed for this cell.
train_data['NumSents']
test_data['NumSents']

0        5
1        1
2       19
3        4
4       61
        ..
1754     1
1755     1
1756     1
1757     1
1758    39
Name: NumSents, Length: 1759, dtype: int64

# Tokenization

In [None]:
# Show the raw text of one training speech as a tokenization example.
train_data['Text'][1]

"mr. speaker , today we have some very clear choices . \nit is not every day that we face such black and white options -- often the issues we debate on this floor have many shades of gray . \nbut today , there is no confusion , there is no muddying of the issues , and there is no way to mask the harm this bill would do : cut education spending for the first time in a decade , slash funding for worker and youth training , and provide no increase for home heating assistance for low-income families . \ntoday , we have a choice . \nwe can pass a bill that will be detrimental to our children 's future ; that will hurt students in need of financial assistance to go to college ; that will not help families struggling to pay their heating bills ; and that will severely hinder research and preventive health efforts . \nor we can reject this bill and demand something better for american families . \nwe have heard that this bill is the result of priorities . \nwell , this is one point where i agr

In [None]:
# Phrases like 'Mr. Chairman' and 'Mr. Speaker' appear frequently but are not informative,
# so they are added to a custom stopwords list.

### Custom Stopwords Creation

In [None]:
# Initialize builtin and custom stopwords
stopwords = nltk.corpus.stopwords.words('english')
# Forms of address and procedural words from the speeches; each entry is
# listed exactly once.
customStopWords = ['mr', 'chairman', 'speaker', 'madam', 'mr.',
                   'yield', 'gentleman', 'gentlewoman', 'minutes', 'time']
stopwords.extend(customStopWords)

# Join stopwords and punctuation, plus a few multi-character tokens
# (dashes, quote marks, clitics) that are not in string.punctuation
punct = list(string.punctuation)
stops = stopwords + punct + ['--',"''", 'r.', '``', "'s", "n't"]

In [None]:
# Function to tokenize 
def tokenize_speech(text, party):
    """Tokenize a speech and drop stopword and punctuation tokens.

    Parameters
    ----------
    text : str
        Raw speech text.
    party : str
        Party code of the speaker. Accepted so existing callers keep
        working; the same filtering is applied regardless of its value.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize(text)`` that are not in the
        module-level ``stops`` collection, in their original order.
    """
    # A set gives constant-time membership checks for every token instead of
    # scanning the whole stop list each time.
    blocked = set(stops)
    return [word for word in word_tokenize(text) if word not in blocked]
        

In [None]:
def _tokens_for_row(row):
    """Tokenize one DataFrame row from its 'Text' and 'Party' fields."""
    return tokenize_speech(row['Text'], row['Party'])


# Add the filtered token list of every speech to both splits
train_data['Tokens'] = train_data.apply(_tokens_for_row, axis=1)
test_data['Tokens'] = test_data.apply(_tokens_for_row, axis=1)

### Demonstration of Speech tokenization and token count(s)

In [None]:
# Show the tokens of one training speech together with its token counts
sample_tokens = train_data['Tokens'][1]
print(sample_tokens, '\n\n')
print('Total tokens:', len(sample_tokens))
print('Total UNIQUE tokens:', len(set(sample_tokens)))

['today', 'clear', 'choices', 'every', 'day', 'face', 'black', 'white', 'options', 'often', 'issues', 'debate', 'floor', 'many', 'shades', 'gray', 'today', 'confusion', 'muddying', 'issues', 'way', 'mask', 'harm', 'bill', 'would', 'cut', 'education', 'spending', 'first', 'decade', 'slash', 'funding', 'worker', 'youth', 'training', 'provide', 'increase', 'home', 'heating', 'assistance', 'low-income', 'families', 'today', 'choice', 'pass', 'bill', 'detrimental', 'children', 'future', 'hurt', 'students', 'need', 'financial', 'assistance', 'go', 'college', 'help', 'families', 'struggling', 'pay', 'heating', 'bills', 'severely', 'hinder', 'research', 'preventive', 'health', 'efforts', 'reject', 'bill', 'demand', 'something', 'better', 'american', 'families', 'heard', 'bill', 'result', 'priorities', 'well', 'one', 'point', 'agree', 'republican', 'colleagues', 'bill', 'result', 'priorities', 'wrong', 'priorities', 'republican', 'leadership', 'congress', 'content', 'spend', 'tax', 'cuts', 'ent

### Total Tokens & Unique Tokens for each Speech

In [None]:
# For each split, add the TOTAL token count per text and then the UNIQUE
# (distinct) token count per text
for frame in (train_data, test_data):
    token_lists = frame['Tokens']
    frame['Total_tokens'] = token_lists.map(len)
    frame['Unique_tokens'] = token_lists.map(lambda tokens: len(set(tokens)))

In [None]:
def _vocabulary_size(token_lists):
    """Return the number of distinct tokens across an iterable of token lists."""
    vocabulary = set()
    for tokens in token_lists:
        vocabulary.update(tokens)
    return len(vocabulary)


# Total token count per party in the training speeches
Rtoks = train_data.loc[train_data['Party'] == 'R', 'Total_tokens'].sum()
Dtoks = train_data.loc[train_data['Party'] == 'D', 'Total_tokens'].sum()
Itoks = train_data.loc[train_data['Party'] == 'I', 'Total_tokens'].sum()

# Distinct tokens per party. Adding up the per-speech 'Unique_tokens' values
# would count a word once for every speech it occurs in, so the party
# vocabulary is built from the token lists themselves.
RtoksU = _vocabulary_size(train_data.loc[train_data['Party'] == 'R', 'Tokens'])
DtoksU = _vocabulary_size(train_data.loc[train_data['Party'] == 'D', 'Tokens'])
ItoksU = _vocabulary_size(train_data.loc[train_data['Party'] == 'I', 'Tokens'])

In [None]:
# Total tokens in the training corpus: the sum of the per-party totals
totalToks = Rtoks+Dtoks+Itoks

# Distinct tokens in the training corpus. Per-party figures cannot simply be
# added because the same word can be used by more than one party (and in more
# than one speech), so the vocabulary is collected from the token lists.
corpus_vocabulary = set()
for speech_tokens in train_data['Tokens']:
    corpus_vocabulary.update(speech_tokens)
totalToksU = len(corpus_vocabulary)

### Cumulative Total  & Unique Total Tokens

In [None]:
# Report the corpus-level token figures
corpus_total_line = f'There are {totalToks} total words in the corpus'
corpus_unique_line = f'There are {totalToksU} UNIQUE words in the corpus'
print(corpus_total_line, '\n')
print(corpus_unique_line)

There are 707148 total words in the corpus 

There are 463060 UNIQUE words in the corpus


### Cumulative Total & Unique Total Tokens by Party

In [None]:
# Report the per-party token totals, then the per-party unique figures
republican_total = f'There are {Rtoks} words in the Republican speeches'
democrat_total = f'There are {Dtoks} words in the Democrat speeches'
independent_total = f'There are {Itoks} words in the Independent speeches'
print(republican_total)
print(democrat_total)
print(independent_total, '\n')

for unique_line in (
    f'There are {RtoksU} UNIQUE words in the Republican speeches',
    f'There are {DtoksU} UNIQUE words in the Democrat speeches',
    f'There are {ItoksU} UNIQUE words in the Independent speeches',
):
    print(unique_line)

There are 293632 words in the Republican speeches
There are 411302 words in the Democrat speeches
There are 2214 words in the Independent speeches 

There are 195415 UNIQUE words in the Republican speeches
There are 266259 UNIQUE words in the Democrat speeches
There are 1386 UNIQUE words in the Independent speeches


In [None]:
# Preview the training frame with the sentence and token columns added.
train_data.head(5)

Unnamed: 0,File,Text,Label,Party,Discussion,Vote,NumSents,Tokens,Total_tokens,Unique_tokens
0,282_400436_1413023_DMN,"mr. speaker , i would like to say a word about...",DMN,D,M,N,17,"[would, like, say, word, illinois, also, proba...",167,127
1,088_400272_2994052_DON,"mr. speaker , today we have some very clear ch...",DON,D,O,N,16,"[today, clear, choices, every, day, face, blac...",196,149
2,038_400080_0251064_DON,"mr. speaker , i yield myself such time as i ma...",DON,D,O,N,15,"[may, consume, would, like, briefly, describe,...",152,111
3,132_400227_0763073_DON,"mr. chairman , i yield back the balance of my ...",DON,D,O,N,1,"[back, balance]",2,2
4,282_400380_1838049_ROY,"madam chairman , will the gentleman yield ? \n",ROY,R,O,Y,1,[],0,0


In [None]:
# Show the raw text of one test speech.
test_data['Text'][0]

'madam speaker , i yield myself 35 seconds . \ni want to give chairman sensenbrenner the benefit of the presumption of a doubt about this section 215 business . \nwhat happens in the report is it makes it easier to get library and other records under section 215 by creating a presumption that records of anyone to come into contact with a suspected terrorist even accidentally , innocently , is relevant to an investigation . \nmadam speaker , what he has done is he has moved a part of section 215 to another part of the bill , and that is why it does not operate that way . \nmadam speaker , i yield 2 minutes to the gentleman from new york ( mr. nadler ) xz4002890 , a ranking subcommittee member of the judiciary committee . \n'

### Avg Number of Sentences by Party

In [None]:
# Mean sentence count per speech for each party, rounded to two decimals
party_of_speech = train_data['Party']
sentence_counts = train_data['NumSents']

RsentAvg = round(sentence_counts[party_of_speech == 'R'].mean(), 2)
DsentAvg = round(sentence_counts[party_of_speech == 'D'].mean(), 2)
IsentAvg = round(sentence_counts[party_of_speech == 'I'].mean(), 2)

republican_line = f'Avg number of Republican Sentences/Speech: {RsentAvg}'
democrat_line = f'Avg number of Democrat Sentences/Speech: {DsentAvg}'
independent_line = f'Avg number of Independent Sentences/Speech: {IsentAvg}'
print(republican_line, '\n', democrat_line, '\n', independent_line)

Avg number of Republican Sentences/Speech: 9.97 
 Avg number of Democrat Sentences/Speech: 13.07 
 Avg number of Independent Sentences/Speech: 8.62


### Avg Number of Words by Party

In [None]:
# Mean token count per speech for each party (left unrounded)
speech_party = train_data['Party']
token_totals = train_data['Total_tokens']

AvgRtoks = token_totals[speech_party == 'R'].mean()
AvgDtoks = token_totals[speech_party == 'D'].mean()
AvgItoks = token_totals[speech_party == 'I'].mean()

republican_words = f'Avg number of Republican Words/Speech: {AvgRtoks}'
democrat_words = f'Avg number of Democrat Words/Speech: {AvgDtoks}'
independent_words = f'Avg number of Independent Words/Speech: {AvgItoks}'
print(republican_words, '\n', democrat_words, '\n', independent_words)

Avg number of Republican Words/Speech: 105.39554917444364 
 Avg number of Democrat Words/Speech: 144.41783707865167 
 Avg number of Independent Words/Speech: 85.15384615384616


In [None]:
# Preview the test frame with the sentence and token columns added.
test_data.head()

Unnamed: 0,File,Text,Label,Party,Discussion,Vote,NumSents,Tokens,Total_tokens,Unique_tokens
0,414_400080_3170075_DON,"madam speaker , i yield myself 35 seconds . \n...",DON,D,O,N,5,"[35, seconds, want, give, sensenbrenner, benef...",53,46
1,414_400061_1909178_ROY,"mr. chairman , i demand a recorded vote . \n",ROY,R,O,Y,1,"[demand, recorded, vote]",3,3
2,102_400175_0641038_ROY,"mr. speaker , i rise today as a cosponsor of h...",ROY,R,O,Y,19,"[rise, today, cosponsor, h.r, 8, support, rule...",161,111
3,414_400080_3170065_DON,"madam speaker , i yield myself 15 seconds . \n...",DON,D,O,N,4,"[15, seconds, let, remind, friend, returned, c...",27,27
4,472_400314_2238033_DON,"mr. speaker , i thank the ranking member , the...",DON,D,O,N,61,"[thank, ranking, member, new, york, ms., slaug...",656,398


In [None]:
import spacy

# Load the English pipeline by its full package name: the bare "en" shortcut
# link is not available in spaCy v3 and later. Installs that only provide the
# legacy shortcut raise OSError for the full name, so fall back to "en" then.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = spacy.load("en")

In [None]:
# Flag the custom stopwords plus a few extra words as stop words in the
# spaCy vocabulary
addtl_stops = [*customStopWords, 'would', 'say', 'like']
for extra_stop in addtl_stops:
    nlp.vocab[extra_stop].is_stop = True
    
def lemmatize_pipe(doc):
    """Collect the lemmas of the content tokens in `doc`.

    A token is kept only when it is alphabetic, is not flagged as a stop
    word and does not look like a number. Lemmas are returned as plain
    strings in document order.
    """
    lemmas = []
    for token in doc:
        if not token.is_alpha or token.is_stop or token.like_num:
            continue
        lemmas.append(str(token.lemma_))
    return lemmas

def preprocess_pipe(texts):
    """Lemmatize every text in `texts` and return one lemma list per text.

    The texts are streamed through ``nlp.pipe`` in batches of 20 and each
    resulting document is reduced to its lemma list by ``lemmatize_pipe``.
    """
    return [lemmatize_pipe(doc) for doc in nlp.pipe(texts, batch_size=20)]

In [None]:
%%time
# Lemmatize every training speech; stores one lemma list per row.
train_data['lemmas'] = preprocess_pipe(train_data['Text'])

CPU times: user 1min 51s, sys: 20.7 s, total: 2min 12s
Wall time: 2min 12s


In [None]:
%%time
# Lemmatize every test speech; stores one lemma list per row.
test_data['lemmas'] = preprocess_pipe(test_data['Text'])

CPU times: user 39.3 s, sys: 7.24 s, total: 46.6 s
Wall time: 46.9 s


# Pickle the training and test data for downstream import

In [None]:
# Write both processed frames to pickle files in the working directory
for frame, pickle_path in ((train_data, './train_data.pkl'),
                           (test_data, './test_data.pkl')):
    frame.to_pickle(pickle_path)