## Convote Dataset - Basic Preprocessing
http://www.cs.cornell.edu/home/llee/data/convote.html


In [2]:
import pandas as pd
import os

In [3]:
# For now, we will use the data from stage one

train_path = ('../convote_v1.1/data_stage_one/training_set/')
test_path = ('../convote_v1.1/data_stage_one/test_set/')
train_file_names = os.listdir(train_path)
test_file_names = os.listdir(test_path)



# Create Dictionary for File Name and Text
file_name_and_text = {}

for file in train_file_names:
    with open(train_path + file, 'r', encoding="utf8") as target_file:
        file_name_and_text[file] = target_file.read()

print(file_name_and_text)
# Structure the dataframe such that the file name is the index
train_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'File', 0: 'Text'}))

# This is redundant but fastest given the data is partitioned into directories
file_name_and_text = {}
for file in test_file_names:
    with open(test_path + file, 'r', encoding="utf8") as target_file:
         file_name_and_text[file] = target_file.read()

test_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'File', 0: 'Text'}))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [4]:
train_data.head()

Unnamed: 0,File,Text
0,006_400009_0002057_DON.txt,"mr. speaker , i rise in opposition to the rule..."
1,006_400011_0002002_DON.txt,"mr. speaker , i rise for a constitutional poin..."
2,006_400011_0002003_DMN.txt,"mr. speaker , the resolution we are preparing ..."
3,006_400011_0002007_DON.txt,"mr. speaker , consistent with the oath of offi..."
4,006_400011_0002008_DON.txt,"mr. speaker , on that i demand the yeas and na..."


In [5]:
test_data.head()

Unnamed: 0,File,Text
0,048_400008_0296010_DON.txt,"mr. chairman , i thank the gentleman from mich..."
1,048_400008_0297068_DON.txt,"mr. chairman , i thank the gentleman for yield..."
2,048_400009_0297022_DMN.txt,"mr. chairman , i rise in opposition to h.r. 27..."
3,048_400027_0297016_RMY.txt,"mr. chairman , i rise today in support of h.r...."
4,048_400029_0294001_ROY.txt,"mr. speaker , by direction of the committee on..."


In [6]:
# Remove file extension
train_data['File'] = train_data['File'].map(lambda x: x.replace('.txt', ''))
test_data['File'] = test_data['File'].map(lambda x: x.replace('.txt', ''))

In [7]:
# Add Label(R/D) to each text
Label = []

for i in train_data.File:
    Label.append(i[-3:])
    
train_data['Label'] = Label

In [19]:
train_data['Party_Indicator'] = train_data['Label'].str.slice(0, 1)
train_data['Bill_Indicator'] = train_data['Label'].str.slice(1, 2)
train_data['Voter_Indicator'] = train_data['Label'].str.slice(2, 3)

In [20]:
train_data

Unnamed: 0,File,Text,Label,Party_Indicator,Bill_Indicator,Voter_Indicator
0,006_400009_0002057_DON,"mr. speaker , i rise in opposition to the rule...",DON,D,O,N
1,006_400011_0002002_DON,"mr. speaker , i rise for a constitutional poin...",DON,D,O,N
2,006_400011_0002003_DMN,"mr. speaker , the resolution we are preparing ...",DMN,D,M,N
3,006_400011_0002007_DON,"mr. speaker , consistent with the oath of offi...",DON,D,O,N
4,006_400011_0002008_DON,"mr. speaker , on that i demand the yeas and na...",DON,D,O,N
...,...,...,...,...,...,...
5655,645_400263_3228050_DON,"mr. speaker , on that i demand the yeas and na...",DON,D,O,N
5656,645_400276_3228014_ROY,"i thank the gentleman for yielding , and i ris...",ROY,R,O,Y
5657,645_400315_3228005_ROY,i thank the chairman for yielding . \nas a mem...,ROY,R,O,Y
5658,645_400433_3228036_ROY,"mr. speaker , i am here today in support of th...",ROY,R,O,Y


In [9]:
test_data

Unnamed: 0,File,Text
0,048_400008_0296010_DON,"mr. chairman , i thank the gentleman from mich..."
1,048_400008_0297068_DON,"mr. chairman , i thank the gentleman for yield..."
2,048_400009_0297022_DMN,"mr. chairman , i rise in opposition to h.r. 27..."
3,048_400027_0297016_RMY,"mr. chairman , i rise today in support of h.r...."
4,048_400029_0294001_ROY,"mr. speaker , by direction of the committee on..."
...,...,...
1754,599_400328_2990023_ROY,"mr. speaker , i yield myself such time as i ma..."
1755,599_400328_2990025_ROY,i yield to the gentleman from florida . \n
1756,599_400328_2990027_ROY,"mr. speaker , reclaiming my time , the gentlem..."
1757,599_400328_2990029_ROY,"mr. speaker , i yield back the balance of my t..."


In [10]:
# The test data has some Stage 2 files, not sure why - but I labeled the text around this
Label = []
for i in test_data.File:
    label = i.rstrip('0123456789.- ')
    Label.append(label[-3:])

test_data['Label'] = Label

In [21]:
test_data['Party_Indicator'] = test_data['Label'].str.slice(0, 1)
test_data['Bill_Indicator'] = test_data['Label'].str.slice(1, 2)
test_data['Voter_Indicator'] = test_data['Label'].str.slice(2, 3)
test_data

Unnamed: 0,File,Text,Label,Party_Indicator,Bill_Indicator,Voter_Indicator
0,048_400008_0296010_DON,"mr. chairman , i thank the gentleman from mich...",DON,D,O,N
1,048_400008_0297068_DON,"mr. chairman , i thank the gentleman for yield...",DON,D,O,N
2,048_400009_0297022_DMN,"mr. chairman , i rise in opposition to h.r. 27...",DMN,D,M,N
3,048_400027_0297016_RMY,"mr. chairman , i rise today in support of h.r....",RMY,R,M,Y
4,048_400029_0294001_ROY,"mr. speaker , by direction of the committee on...",ROY,R,O,Y
...,...,...,...,...,...,...
1754,599_400328_2990023_ROY,"mr. speaker , i yield myself such time as i ma...",ROY,R,O,Y
1755,599_400328_2990025_ROY,i yield to the gentleman from florida . \n,ROY,R,O,Y
1756,599_400328_2990027_ROY,"mr. speaker , reclaiming my time , the gentlem...",ROY,R,O,Y
1757,599_400328_2990029_ROY,"mr. speaker , i yield back the balance of my t...",ROY,R,O,Y


### There are two labeled dataframes; the next steps will be text preprocessing (i.e. tokenization, stemming, etc)

In [22]:
train_data.to_csv('Train_speech.csv',index=False)
test_data.to_csv('Test_speech.csv',index=False)