## Convote Dataset - Basic Preprocessing
- Data Main Page: http://www.cs.cornell.edu/home/llee/data/convote.html 
- About the Data: http://www.cs.cornell.edu/home/llee/data/convote/README.v1.1.txt


In [3]:
import pandas as pd
import os
import string
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, sent_tokenize
from nltk.probability import FreqDist

In [4]:
# For now, we will use the data from stage one
train_path = ('./convote_v1.1/data_stage_one/training_set/')
test_path = ('./convote_v1.1/data_stage_one/test_set/')
train_file_names = os.listdir(train_path)
test_file_names = os.listdir(test_path)

# Create Dictionary for File Name and Text
file_name_and_text = {}
for file in train_file_names:
    with open(train_path + file, 'r') as target_file:
         file_name_and_text[file] = target_file.read()

# Structure the dataframe such that the file name is the index
train_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'File', 0: 'Text'}))

# This is redundant but fastest given the data is partitioned into directories
file_name_and_text = {}
for file in test_file_names:
    with open(test_path + file, 'r', encoding='UTF') as target_file:
         file_name_and_text[file] = target_file.read()

test_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'File', 0: 'Text'}))

In [5]:
train_data.head()

Unnamed: 0,File,Text
0,006_400009_0002057_DON.txt,"mr. speaker , i rise in opposition to the rule..."
1,006_400011_0002002_DON.txt,"mr. speaker , i rise for a constitutional poin..."
2,006_400011_0002003_DMN.txt,"mr. speaker , the resolution we are preparing ..."
3,006_400011_0002007_DON.txt,"mr. speaker , consistent with the oath of offi..."
4,006_400011_0002008_DON.txt,"mr. speaker , on that i demand the yeas and na..."


In [6]:
test_data.head()

Unnamed: 0,File,Text
0,048_400008_0296010_DON.txt,"mr. chairman , i thank the gentleman from mich..."
1,048_400008_0297068_DON.txt,"mr. chairman , i thank the gentleman for yield..."
2,048_400009_0297022_DMN.txt,"mr. chairman , i rise in opposition to h.r. 27..."
3,048_400027_0297016_RMY.txt,"mr. chairman , i rise today in support of h.r...."
4,048_400029_0294001_ROY.txt,"mr. speaker , by direction of the committee on..."


In [7]:
# Remove file extension
train_data['File'] = train_data['File'].map(lambda x: x.replace('.txt', ''))
test_data['File'] = test_data['File'].map(lambda x: x.replace('.txt', ''))

In [8]:
# Add Label feature (derived from end of file name)
Label = []

for i in train_data.File:
    Label.append(i[-3:])
    
train_data['Label'] = Label

### 'Label' feature Details 
- 'P' is replaced by a party indicator, D or R (or X if no
   corresponding party could be found).  As mentioned in the paper, we 
   purposely *did not* use this information in our experiments.

- 'M' is replaced by an indicator of whether the bill under
   discussion is mentioned directly in the speech segment, or whether it is
   only referenced by another speech segment on the same page.  If the bill is
   directly mentioned in the current speech, the letter M appears in
   the file name; otherwise, the letter O appears.

- 'V' is replaced by a vote indicator, Y or N, which serves as the
   ground-truth label for the speech.

In [9]:
# Split Label into three columns
train_data['Party'] = train_data['Label'].apply(lambda x: pd.Series(list(x)))[0]
train_data['Discussion'] = train_data['Label'].apply(lambda x: pd.Series(list(x)))[1]
train_data['Vote'] = train_data['Label'].apply(lambda x: pd.Series(list(x)))[2]

In [10]:
train_data.head()

Unnamed: 0,File,Text,Label,Party,Discussion,Vote
0,006_400009_0002057_DON,"mr. speaker , i rise in opposition to the rule...",DON,D,O,N
1,006_400011_0002002_DON,"mr. speaker , i rise for a constitutional poin...",DON,D,O,N
2,006_400011_0002003_DMN,"mr. speaker , the resolution we are preparing ...",DMN,D,M,N
3,006_400011_0002007_DON,"mr. speaker , consistent with the oath of offi...",DON,D,O,N
4,006_400011_0002008_DON,"mr. speaker , on that i demand the yeas and na...",DON,D,O,N


In [11]:
# Test data has some Stage 2 files, not sure why - removed 2
Label = []
for i in test_data.File:
    label = i.rstrip('0123456789.- ')
    Label.append(label[-3:])

test_data['Label'] = Label

In [12]:
# Split label into distinct columns
test_data['Party'] = test_data['Label'].apply(lambda x: pd.Series(list(x)))[0]
test_data['Discussion'] = test_data['Label'].apply(lambda x: pd.Series(list(x)))[1]
test_data['Vote'] = test_data['Label'].apply(lambda x: pd.Series(list(x)))[2]

In [13]:
test_data.head()

Unnamed: 0,File,Text,Label,Party,Discussion,Vote
0,048_400008_0296010_DON,"mr. chairman , i thank the gentleman from mich...",DON,D,O,N
1,048_400008_0297068_DON,"mr. chairman , i thank the gentleman for yield...",DON,D,O,N
2,048_400009_0297022_DMN,"mr. chairman , i rise in opposition to h.r. 27...",DMN,D,M,N
3,048_400027_0297016_RMY,"mr. chairman , i rise today in support of h.r....",RMY,R,M,Y
4,048_400029_0294001_ROY,"mr. speaker , by direction of the committee on...",ROY,R,O,Y


### Party Speech Distribution

In [14]:
print('Training Party Distribution:', '\n', train_data['Party'].value_counts(), '\n')
print('Test Party Distribution:', '\n', test_data['Party'].value_counts(), '\n')

Training Party Distribution: 
 D    2848
R    2786
I      26
Name: Party, dtype: int64 

Test Party Distribution: 
 R    891
D    863
I      5
Name: Party, dtype: int64 



### Sentence Count & Length

In [15]:
# Add column for sentence count
train_data['NumSents'] = train_data['Text'].apply(lambda x: len(x.strip().split('\n')))
test_data['NumSents'] = test_data['Text'].apply(lambda x: len(x.strip().split('\n')))

In [16]:
train_data['NumSents']
test_data['NumSents']

0       24
1       13
2       17
3       14
4        1
        ..
1754     9
1755     1
1756     2
1757     2
1758     1
Name: NumSents, Length: 1759, dtype: int64

# Tokenization

In [17]:
train_data['Text'][1]

'mr. speaker , i rise for a constitutional point of order . \n'

In [16]:
# Phrases like: 'Mr. Chairman' & 'Mr. Speaker' appear frequently - not informative
# Add to custom stopwords list

### Custom Stopwords Creation

In [18]:
# Initialize builtin and custom stopwords
stopwords = nltk.corpus.stopwords.words('english')
customStopWords = ['mr', 'chairman','mr', 'speaker', 'madam', 'mr.']
stopwords.extend(customStopWords)

# Join stopwords and punctuation
punct = list(string.punctuation)
stops = stopwords + punct + ['--',"''", 'r.', '``', "'s", "n't"]

In [19]:
# Function to tokenize 
def tokenize_speech(text, party):
    if 'R' in party:
        return [word for word in word_tokenize(text) if not word in stops]
    elif 'D' in party:
        return [word for word in word_tokenize(text) if not word in stops]
    else:
        return [word for word in word_tokenize(text) if not word in stops]
        

In [20]:
train_data['Tokens'] = train_data.apply(lambda x: tokenize_speech(x['Text'],x['Party']),axis=1)
test_data['Tokens'] = test_data.apply(lambda x: tokenize_speech(x['Text'],x['Party']),axis=1)

### Demonstration of Speech tokenization and token count(s)

In [21]:
print(train_data['Tokens'][1], '\n\n')
print('Total tokens:', len(train_data['Tokens'][1]))
print('Total UNIQUE tokens:', len(set(train_data['Tokens'][1])))

['rise', 'constitutional', 'point', 'order'] 


Total tokens: 4
Total UNIQUE tokens: 4


### Total Tokens & Unique Tokens for each Speech

In [22]:
# Create column of TOTAL token count per text
train_data['Total_tokens'] = train_data['Tokens'].apply(lambda x: len(x))
test_data['Total_tokens'] = test_data['Tokens'].apply(lambda x: len(x))

# Create column of UNIQUE token count per text
train_data['Unique_tokens'] = train_data['Tokens'].apply(lambda x: len(set(x)))
test_data['Unique_tokens'] = test_data['Tokens'].apply(lambda x: len(set(x)))

In [23]:
Rtoks = (sum(train_data.Total_tokens[train_data['Party'] == 'R']))
Dtoks = (sum(train_data.Total_tokens[train_data['Party'] == 'D']))
Itoks = (sum(train_data.Total_tokens[train_data['Party'] == 'I']))

RtoksU = (sum(train_data.Unique_tokens[train_data['Party'] == 'R']))
DtoksU = (sum(train_data.Unique_tokens[train_data['Party'] == 'D']))
ItoksU = (sum(train_data.Unique_tokens[train_data['Party'] == 'I']))

In [24]:
totalToks = Rtoks+Dtoks+Itoks
totalToksU = RtoksU+DtoksU+ItoksU

### Cumulative Total  & Unique Total Tokens

In [25]:
print(f'There are {totalToks} total words in the corpus', '\n')
print(f'There are {totalToksU} UNIQUE words in the corpus')

There are 720022 total words in the corpus 

There are 472123 UNIQUE words in the corpus


### Cumulative Total & Unique Total Tokens by Party

In [26]:
print(f'There are {Rtoks} words in the Republican speeches')
print(f'There are {Dtoks} words in the Democrat speeches')
print(f'There are {Itoks} words in the Independent speeches', '\n')

print(f'There are {RtoksU} UNIQUE words in the Republican speeches')
print(f'There are {DtoksU} UNIQUE words in the Democrat speeches')
print(f'There are {ItoksU} UNIQUE words in the Independent speeches')

There are 300202 words in the Republican speeches
There are 417527 words in the Democrat speeches
There are 2293 words in the Independent speeches 

There are 200031 UNIQUE words in the Republican speeches
There are 270660 UNIQUE words in the Democrat speeches
There are 1432 UNIQUE words in the Independent speeches


In [27]:
train_data.head(5)

Unnamed: 0,File,Text,Label,Party,Discussion,Vote,NumSents,Tokens,Total_tokens,Unique_tokens
0,006_400009_0002057_DON,"mr. speaker , i rise in opposition to the rule...",DON,D,O,N,15,"[rise, opposition, rules, package, us, today, ...",146,109
1,006_400011_0002002_DON,"mr. speaker , i rise for a constitutional poin...",DON,D,O,N,1,"[rise, constitutional, point, order]",4,4
2,006_400011_0002003_DMN,"mr. speaker , the resolution we are preparing ...",DMN,D,M,N,8,"[resolution, preparing, consider, proposed, ru...",120,89
3,006_400011_0002007_DON,"mr. speaker , consistent with the oath of offi...",DON,D,O,N,1,"[consistent, oath, office, took, would, reques...",10,10
4,006_400011_0002008_DON,"mr. speaker , on that i demand the yeas and na...",DON,D,O,N,1,"[demand, yeas, nays]",3,3


In [28]:
test_data['Text'][0]

"mr. chairman , i thank the gentleman from michigan for yielding me this time . \ni am opposed to this bill because it reflects a misunderstanding of the proper way to build a successful career and a gross misinterpretation of our constitutional tradition . \nwith respect to its misunderstanding of the best way to build a career , i think that these personal retraining accounts , although clearly well intentioned , have exactly the wrong effect on an unemployed person . \nthe purpose of workforce investment is not to move a person from a position of unemployment to a position of employment for a while . \nthe purpose of the workforce investment is to move a person from dependency to opportunity and eventually to prosperity . \nthe great dividing line in the american economy is whether one has 2 years of college or not . \npeople with more than 2 years of college tend to have stable jobs and high and rising incomes . \nthis bill says to a person who is laid off from an industrial indust

### Avg Number of Sentences by Party

In [29]:
RsentAvg = round((train_data.NumSents[train_data['Party'] == 'R']).mean(), 2)
DsentAvg = round((train_data.NumSents[train_data['Party'] == 'D']).mean(), 2)
IsentAvg = round((train_data.NumSents[train_data['Party'] == 'I']).mean(), 2)

print(f'Avg number of Republican Sentences/Speech: {RsentAvg}', '\n', 
      f'Avg number of Democrat Sentences/Speech: {DsentAvg}', '\n', 
      f'Avg number of Independent Sentences/Speech: {IsentAvg}')

Avg number of Republican Sentences/Speech: 9.97 
 Avg number of Democrat Sentences/Speech: 13.07 
 Avg number of Independent Sentences/Speech: 8.62


### Avg Number of Words by Party

In [30]:
AvgRtoks = (train_data.Total_tokens[train_data['Party'] == 'R']).mean()
AvgDtoks = (train_data.Total_tokens[train_data['Party'] == 'D']).mean()
AvgItoks = (train_data.Total_tokens[train_data['Party'] == 'I']).mean()

print(f'Avg number of Republican Words/Speech: {AvgRtoks}', '\n', 
      f'Avg number of Democrat Words/Speech: {AvgDtoks}', '\n', 
      f'Avg number of Independent Words/Speech: {AvgItoks}')

Avg number of Republican Words/Speech: 107.7537688442211 
 Avg number of Democrat Words/Speech: 146.60358146067415 
 Avg number of Independent Words/Speech: 88.1923076923077


In [None]:
# Writing data from dataframes into csv files
train_data.to_csv('Train_speech.csv',index=False)
test_data.to_csv('Test_speech.csv',index=False)

### Convert string of words to Integers

For using the RNN model, we will be changing the string of words to integers since it only reads integers. For text generation, we would be using combination of features and labels. Providing a string of words(Features) and after training it, it would predict next word(label) 

In [31]:
# Extract text from the training dataset
original_train_abstract = list(train_data['Text'])
len(original_train_abstract)

5660

In [38]:
#Create a function to add spaces
import re
def format_abstract(abstract):
    """Add spaces around punctuation"""

    # Add spaces around punctuation
    abstract = re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', abstract)
    # Remove double spaces
    abstract = re.sub(r'\s\s', ' ', abstract)
    return abstract

formatted_train = []
# Iterate through all the original text from the training dataset
for a in original_train_abstract:
    formatted_train.append(format_abstract(a))

In [39]:
#Import the tensorflow and Keras library for using tokenizer
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer

In [40]:
# Create a function to generate word to integer mapping with features and labels
def make_sequences(texts,
                   training_length=50,
                   lower=True,
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
    """Turn a set of texts into sequences of integers"""

    # Create the tokenizer object and train on texts
    tokenizer = Tokenizer(lower=lower, filters=filters)
    tokenizer.fit_on_texts(texts)
   # Create look-up dictionaries and reverse look-ups
    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    num_words = len(word_idx) + 1
    word_counts = tokenizer.word_counts

    print(f'There are {num_words} unique words.')
    # Convert text to sequences of integers
    sequences = tokenizer.texts_to_sequences(texts)

    # Limit to sequences with more than training length tokens
    seq_lengths = [len(x) for x in sequences]
    over_idx = [
        i for i, l in enumerate(seq_lengths) if l > (training_length + 20)
    ]

    new_texts = []
    new_sequences = []

    # Only keep sequences with more than training length tokens
    for i in over_idx:
        new_texts.append(texts[i])
        new_sequences.append(sequences[i])

    training_seq = []
    labels = []
    # Iterate through the sequences of tokens
    for seq in new_sequences:

        # Create multiple training examples from each sequence
        for i in range(training_length, len(seq)):
            # Extract the features and label
            extract = seq[i - training_length:i + 1]

            # Set the features and label
            training_seq.append(extract[:-1])
            labels.append(extract[-1])

    print(f'There are {len(training_seq)} training sequences.')

    # Return everything needed for setting up the model
    return word_idx, idx_word, num_words, word_counts, new_texts, new_sequences, training_seq, labels

In [41]:
# Pass the training data to check the unique sequence, here we have taken 50 as length of  a sequence
TRAINING_LENGTH = 50

filters = '!"#$%&()*+/:<=>@[\\]^_`{|}~\t\n'
word_idx_train, idx_word_train, num_words_train, word_counts_train, abstracts_train, sequences_train, features_train, labels_train = make_sequences(
    formatted_train, TRAINING_LENGTH, lower=True, filters=filters)

There are 26060 unique words.
There are 1374135 training sequences.


In [56]:
#Check the word counts and  word to integer mapping
print(word_counts_train)
word_idx_train



{'the': 1,
 '.': 2,
 ',': 3,
 'to': 4,
 'of': 5,
 'and': 6,
 'that': 7,
 'in': 8,
 'a': 9,
 'is': 10,
 'i': 11,
 'this': 12,
 'for': 13,
 'we': 14,
 'it': 15,
 'not': 16,
 'are': 17,
 'have': 18,
 'on': 19,
 'mr': 20,
 'from': 21,
 'be': 22,
 'our': 23,
 'as': 24,
 'bill': 25,
 'with': 26,
 'will': 27,
 'by': 28,
 'they': 29,
 'would': 30,
 'chairman': 31,
 "'s": 32,
 'my': 33,
 'do': 34,
 'has': 35,
 'their': 36,
 'an': 37,
 ';': 38,
 'or': 39,
 'gentleman': 40,
 'speaker': 41,
 'but': 42,
 'time': 43,
 'at': 44,
 'amendment': 45,
 'who': 46,
 'can': 47,
 'was': 48,
 'all': 49,
 'these': 50,
 'what': 51,
 'if': 52,
 'there': 53,
 'committee': 54,
 'people': 55,
 'more': 56,
 'you': 57,
 'about': 58,
 'so': 59,
 'which': 60,
 'been': 61,
 'yield': 62,
 'one': 63,
 'support': 64,
 'us': 65,
 'no': 66,
 'those': 67,
 'because': 68,
 'states': 69,
 'other': 70,
 'new': 71,
 '?': 72,
 'state': 73,
 'when': 74,
 'federal': 75,
 'should': 76,
 'energy': 77,
 'congress': 78,
 'house': 79,
 'b

In [49]:
#Print features by passing any value of n
n = 4
features_train[n][:5]

[11, 171, 8, 303, 4]

In [57]:
#Function to check label based on a feature
def find_label(value):
    """Find label corresponding to features for index in training data"""

    # Find features and label
    feature = ' '.join(idx_word_train[i] for i in features_train[value])
    label = idx_word_train[labels_train[value]]

    print('Features:', feature)
    print('\nLabel: ', label)

In [64]:
# Check labels by inputing the index value and see the generated label
find_label(10)

Features: rules package that we have before us today . it is outrageous that my republican colleagues have placed before us a rules package that at best lacks integrity , and at worst is completely unethical . as the highest body of elected officials in our country , we should be

Label:  held


# Additional analysis using spacy library

### spaCy is a free open-source library for Natural Language Processing in Python. It features NER, POS tagging, dependency parsing, word vectors and more.
### Link to explore more about spacy - https://spacy.io/

In [4]:
#!pip install --user spacy
# https://spacy.io/usage/linguistic-features
# mport spacy library
import spacy
import numpy as np

In [7]:
# Reading data from files
train_data = pd.read_csv('Train_speech.csv')[['Text','Party']]
test_data = pd.read_csv('Test_speech.csv')[['Text','Party']]

In [9]:
# Creating dataframe for Democrats and Republic parties
d_train = test_data[test_data['Party']=='D']
r_train = test_data[test_data['Party']=='R']

In [10]:
import en_core_web_sm

nlp = en_core_web_sm.load()

In [11]:
nlp.max_length = 4925690
d_text = ' '.join(d_train['Text'])
r_text = ' '.join(r_train['Text'])

In [13]:
nlp.max_length = 1674750
d_doc = nlp(d_text)
r_doc = nlp(r_text)

In [14]:
# all tokens that arent stop words or punctuations
words = [token.text for token in r_doc if token.is_stop != True and token.is_punct != True]

In [15]:
from collections import Counter
# most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common(10)
common_words

[('\n', 9187),
 ('mr', 1842),
 ('speaker', 989),
 ('\n ', 866),
 ('act', 808),
 ('bill', 792),
 ('gentleman', 774),
 ('stem', 708),
 ('time', 668),
 ('chairman', 649)]

In [16]:
print("Number of tokens in the document :: ",len(r_doc))
print("Number of sentences in the document :: ",len(list(r_doc.sents)))
print("Number of unique tokens in the document :: ",len(words))

Number of tokens in the document ::  261937
Number of sentences in the document ::  11137
Number of unique tokens in the document ::  116192


In [None]:

for token in r_doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)
    
#Text: The original word text.
#Lemma: The base form of the word.
#POS: The simple UPOS part-of-speech tag.
#Tag: The detailed part-of-speech tag.
#Dep: Syntactic dependency, i.e. the relation between tokens.
#Shape: The word shape – capitalization, punctuation, digits.
#is alpha: Is the token an alpha character?
#is stop: Is the token part of a stop list, i.e. the most common words of the language?

mr mr PROPN NNP dep xx True False
. . PROPN NNP compound . False False
chairman chairman PROPN NNP nsubj xxxx True False
, , PUNCT , punct , False False
i i PRON PRP nsubj x True True
rise rise VERB VBP ccomp xxxx True False
today today NOUN NN npadvmod xxxx True False
in in ADP IN prep xx True True
support support NOUN NN pobj xxxx True False
of of ADP IN prep xx True True
h.r h.r PROPN NNP dep x.x False False
. . PROPN NNP pobj . False False
27 27 NUM CD dep dd False False
, , PUNCT , punct , False False
the the DET DT det xxx True True
job job NOUN NN compound xxx True False
training training NOUN NN compound xxxx True False
improvement improvement PROPN NNP compound xxxx True False
act act PROPN NNP ROOT xxx True False
. . PUNCT . punct . False False

 
 SPACE _SP  
 False False
through through ADP IN prep xxxx True True
local local ADJ JJ amod xxxx True False
and and CCONJ CC cc xxx True True
state state NOUN NN conj xxxx True False
workforce workforce ADJ JJ compound xxxx True Fa