In [79]:
import pandas as pd
import pickle as pkl
from collections import Counter
import os
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split

# Original Dataset with 3 valid classes

## Labels in the original dataset
0 -> Statement <br>
1 -> Interrogative <br>
2 -> Imperative <br>
3 -> Do not use it / Corrupted label <br>

In [2]:
labels = []
text = []

with open('../data/input/kevin/trn_data.csv', 'r') as f:
    for line in f.readlines():
        values = line.split('\t', 1)
        labels.append(values[0])
        text_val = values[1]
        text_val = text_val.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')
        text.append(text_val)

In [3]:
df_data = pd.DataFrame(list(zip(text, labels)), columns=['text', 'label'])

In [4]:
df_data.head()

Unnamed: 0,text,label
0,A cockroach will live nine days without it’s h...,0
1,More people are killed each year from bees tha...,0
2,"Well i-, well it seemed to make sense since I ...",0
3,"So, I have none left what so ever.",0
4,You have you have a lot of younger brothers an...,0


In [5]:
df_data.describe()

Unnamed: 0,text,label
count,1388,1388
unique,1382,3
top,oh no.,0
freq,2,664


In [6]:
labels = []
text = []

with open('../data/input/kevin/tst_data.csv', 'r') as f:
    for line in f.readlines():
        values = line.split('\t', 1)
        labels.append(values[0])
        text_val = values[1]
        text_val = text_val.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')
        text.append(text_val)

In [7]:
df_data2 = pd.DataFrame(list(zip(text, labels)), columns=['text', 'label'])

In [8]:
df_data2.describe()

Unnamed: 0,text,label
count,298,298
unique,297,4
top,Why.,2
freq,2,85


In [9]:
df_data2.head()

Unnamed: 0,text,label
0,Like if she'll ask me where are my crayons whe...,0
1,Ketchup was sold in the 1830s as medicine.,0
2,I need some water.,0
3,yeah yeah.,0
4,"kind of telling him that, you know, from my re...",0


In [10]:
df_combined = df_data.append(df_data2, ignore_index=True)

In [11]:
df_combined

Unnamed: 0,text,label
0,A cockroach will live nine days without it’s h...,0
1,More people are killed each year from bees tha...,0
2,"Well i-, well it seemed to make sense since I ...",0
3,"So, I have none left what so ever.",0
4,You have you have a lot of younger brothers an...,0
...,...,...
1681,"CHANGE THE PROCESS, PPL.",3
1682,You guys do everything wonderful.,3
1683,I'm so frustrated.,3
1684,You will not be disappointed.,3


In [12]:
counter_rule1 = Counter(df_combined['label'])
print(counter_rule1)

Counter({'0': 735, '2': 466, '1': 419, '3': 66})


In [13]:
df_combined.loc[df_combined['label'] == '0'].head()

Unnamed: 0,text,label
0,A cockroach will live nine days without it’s h...,0
1,More people are killed each year from bees tha...,0
2,"Well i-, well it seemed to make sense since I ...",0
3,"So, I have none left what so ever.",0
4,You have you have a lot of younger brothers an...,0


In [14]:
df_combined.loc[df_combined['label'] == '1'].head()

Unnamed: 0,text,label
350,oppressive and hierarchical system.,1
351,Barack Obama is how old.,1
352,daily call options.,1
353,Who's coming.,1
354,night to meet her or do you just think they'd ...,1


In [15]:
df_combined.loc[df_combined['label'] == '2'].head()

Unnamed: 0,text,label
693,Find a sturdy piece of cardboard in the form o...,2
694,Stand up for yourself.,2
695,"Fix out priorities together in a meeting, a co...",2
696,Make one last snowball for the penguin's head.,2
697,Look for the internet venue you will use for y...,2


In [16]:
df_combined.loc[df_combined['label'] == '3'].head()

Unnamed: 0,text,label
1620,I came in to town for a week and forgot my tra...,3
1621,You.,3
1622,"I'll drive an hour just for their volcano, yum.",3
1623,He knows his bees.,3
1624,That's the best bakery in town.,3


### Remove class '3' since they are corrupted labels

In [17]:
df_combined = df_combined.drop(df_combined[df_combined['label'] == '3'].index)

# Imperative dataset

In [18]:
text = []
with open('../data/input/imperatives/ground_truth/imperatives.csv', 'r') as f:
    for line in f.readlines():
        text_val = line.strip()
        text_val = text_val.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')
        text.append(text_val)

### Assign label 2 for imperatives

In [19]:
imp_label = ['2' for i in text]

In [20]:
df_imperatives = pd.DataFrame(zip(text, imp_label), columns=['text', 'label'])

In [21]:

df_imperatives.head()

Unnamed: 0,text,label
0,Find a sturdy piece of cardboard in the form o...,2
1,Stand up for yourself,2
2,"Fix out priorities together in a meeting, a co...",2
3,Make one last snowball for the penguin's head,2
4,Look for the internet venue you will use for y...,2


## Combine original dataset with imperative to create df_gt

In [22]:
df_gt = df_combined.append(df_imperatives, ignore_index=True)

In [23]:
label_counter = Counter(df_gt['label'])
print(label_counter)

Counter({'2': 1636, '0': 735, '1': 419})


# Imperative Dataset - BBT

Source: https://github.com/yiminxsisu/TV-AfD_Imperative_Corpus

The columns respectively are: data source (in the formant of nxmm with 'n' indicating the season of the show and 'mm' representing the episode), text, imperative classification (1 for imperative and 0 for non-imperative), imperative category, and whether the imperative sentence has affixal negative markers or not.

In [24]:
def replace_bbt_text(text):
    text = text.replace('2x01','').replace('2x02','').replace('2x03','').replace('2x04','').replace('2x05','')\
                .replace('2x06','').replace('2x07','').replace('2x08','').replace('2x09','').replace('2x10','')\
                .replace('2x11','').replace('2x12','').replace('2x13','').replace('2x14','').replace('2x15','')\
                .replace('2x16','').replace('2x17','').replace('2x18','').replace('2x19','').replace('2x20','')\
                .replace('2x21','').replace('2x22','').replace('2x23','').replace('2x24','').replace('2x25','')\
                .strip()
    
    text = text.replace('3x01','').replace('3x02','').replace('3x03','').replace('3x04','').replace('3x05','')\
                .replace('3x06','').replace('3x07','').replace('3x08','').replace('3x09','').replace('3x10','')\
                .replace('3x11','').replace('3x12','').replace('3x13','').replace('3x14','').replace('3x15','')\
                .replace('3x16','').replace('3x17','').replace('3x18','').replace('3x19','').replace('3x20','')\
                .replace('3x21','').replace('3x22','').replace('3x23','').replace('3x24','').replace('3x25','')\
                .strip()
        
    text = text.replace('4x01','').replace('4x02','').replace('4x03','').replace('4x04','').replace('4x05','')\
                .replace('4x06','').replace('4x07','').replace('4x08','').replace('4x09','').replace('4x10','')\
                .replace('4x11','').replace('4x12','').replace('4x13','').replace('4x14','').replace('4x15','')\
                .replace('4x16','').replace('4x17','').replace('4x18','').replace('4x19','').replace('4x20','')\
                .replace('4x21','').replace('4x22','').replace('4x23','').replace('4x24','').replace('4x25','')\
                .strip()
    
    return text

In [25]:
text = []
labels = []

In [26]:
input_dir = '../data/input/imperatives_bbt/season_2_labeled/'
only_files = [f for f in listdir(input_dir) if (isfile(join(input_dir, f)) and str(f).endswith('.txt'))]

for file in only_files:
    
    with open(os.path.join(input_dir, file), 'r') as f:
        for line in f.readlines():            
            line = replace_bbt_text(line)
            values = line.split('\t')
            
            if (len(values) >= 2):

                text_val = values[0]
                text_val = text_val.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')

                label = values[1]
                label = label.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')

                text.append(text_val)
                labels.append(label)
            
print(len(text), len(labels))

9706 9706


In [27]:
input_dir = '../data/input/imperatives_bbt/season_3_labeled/'
only_files = [f for f in listdir(input_dir) if (isfile(join(input_dir, f)) and str(f).endswith('.txt'))]

for file in only_files:
    
    with open(os.path.join(input_dir, file), 'r') as f:
        for line in f.readlines():            
            line = replace_bbt_text(line)
            values = line.split('\t')
            
            if (len(values) >= 2):

                text_val = values[0]
                text_val = text_val.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')

                label = values[1]
                label = label.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')

                text.append(text_val)
                labels.append(label)
            
print(len(text), len(labels))

17599 17599


In [28]:
input_dir = '../data/input/imperatives_bbt/season_4_labeled/'
only_files = [f for f in listdir(input_dir) if (isfile(join(input_dir, f)) and str(f).endswith('.txt'))]

for file in only_files:
    
    with open(os.path.join(input_dir, file), 'r') as f:
        for line in f.readlines():            
            line = replace_bbt_text(line)
            values = line.split('\t')
            
            if (len(values) >= 2):

                text_val = values[0]
                text_val = text_val.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')

                label = values[1]
                label = label.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')

                text.append(text_val)
                labels.append(label)
            
print(len(text), len(labels))

25093 25093


In [29]:
df_bbt = pd.DataFrame(list(zip(text, labels)), columns=['text', 'label'])

In [30]:
label_counter = Counter(df_bbt['label'])
print(label_counter)

Counter({'0': 22388, '1': 2148, '': 7, '  0': 3, 'bazinga': 2, 'you always skip': 2, 'problem': 2, 'come in': 2, 'front of each other': 1, 'for taking you to disneyland': 1, 'with something more suited to my pet needs': 1, 'premature menopause, hosting an alien parasite or and i only include it for the sake of covering': 1, 'of roasted potatoes and carrots': 1, "it's going to be you": 1, 'ride the matterhorn': 1, 'smell that': 1, 'alone time': 1, 'ignored the pleas of a damsel in distress': 1, 'you': 1, 'are very happy living together': 1, 'watching': 1, 'minutes': 1, 'with a lessthan': 1, "okay, babydoll pink, let's see if you can cover up the fact that i got my dad's feet": 1, "with a waitress who lives across the hall it ended as inexplicably as it began they had very little in common, except for carnal activity that's why i acquired": 1, 'to visit a center three times a week': 1, 'great jenga tantrum of': 1, 'the alien parasite hypothesis a fair shake': 1, 'an interesting topic': 1

### Imperatives Labels
'1' --> Imperatives <br>
'0' --> Non-imperatives <br>
Others --> Skip them <br>

In [31]:
df_bbt_imperatives = df_bbt.loc[df_bbt['label'] == '1']
df_bbt_imperatives['label'] = '2'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [32]:
df_bbt_imperatives.head()

Unnamed: 0,text,label
16,let's stay serious here,2
35,"no, not all at once",2
37,leonard,2
38,"now, raj",2
39,"now, sheldon",2


In [33]:
df_bbt_imperatives.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2148 entries, 16 to 25090
Data columns (total 2 columns):
text     2148 non-null object
label    2148 non-null object
dtypes: object(2)
memory usage: 50.3+ KB


### Combine df_gt and the df_bbt_imperatives

In [34]:
df_gt = df_gt.append(df_bbt_imperatives, ignore_index=True)

In [35]:
df_gt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4938 entries, 0 to 4937
Data columns (total 2 columns):
text     4938 non-null object
label    4938 non-null object
dtypes: object(2)
memory usage: 77.3+ KB


In [36]:
label_counter = Counter(df_gt['label'])
print(label_counter)

Counter({'2': 3784, '0': 735, '1': 419})


## Imperatives - Wiki dataset

In [37]:
ip_file = '../data/input/imperatives_wiki/wiki_imperative.txt'

text = []
labels = []

with open(ip_file, 'r') as f:
    for line in f.readlines():            
        line = line.strip()
        values = line.split('\t')

        if (len(values) >= 2):

            text_val = values[0]
            text_val = text_val.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')

            label = values[1]
            label = label.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')

            text.append(text_val)
            labels.append(label)
            
print(len(text), len(labels))

10624 10624


In [38]:
df_wiki = pd.DataFrame(list(zip(text, labels)), columns=['text', 'label'])

In [39]:
df_wiki.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10624 entries, 0 to 10623
Data columns (total 2 columns):
text     10624 non-null object
label    10624 non-null object
dtypes: object(2)
memory usage: 166.1+ KB


In [40]:
label_counter = Counter(df_wiki['label'])
print(label_counter)

Counter({'1': 8497, '0': 2127})


In [41]:
df_wiki_imperatives = df_wiki.loc[df_wiki['label'] == '1']
df_wiki_imperatives['label'] = '2'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [42]:
df_wiki_imperatives.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8497 entries, 0 to 10623
Data columns (total 2 columns):
text     8497 non-null object
label    8497 non-null object
dtypes: object(2)
memory usage: 199.1+ KB


### Combine df_gt and the df_wiki_imperatives

In [43]:
df_gt = df_gt.append(df_wiki_imperatives, ignore_index=True)

In [44]:
df_gt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13435 entries, 0 to 13434
Data columns (total 2 columns):
text     13435 non-null object
label    13435 non-null object
dtypes: object(2)
memory usage: 210.0+ KB


In [45]:
label_counter = Counter(df_gt['label'])
print(label_counter)

Counter({'2': 12281, '0': 735, '1': 419})


## WikiQA-train - Sentence & Question data
Source: https://www.microsoft.com/en-us/download/details.aspx?id=52419&from=http%3A%2F%2Fresearch.microsoft.com%2Fapps%2Fmobile%2Fdownload.aspx%3Fp%3D4495da01-db8c-4041-a7f6-7984a4f6a905

In [None]:
# Extracting questions
ip_file = '../data/input/WikiQACorpus/WikiQA-train.txt'
questions = []

with open(ip_file, 'r') as f:
    for line in f.readlines():            
        line = line.strip()
        values = line.split('\t')

        if (len(values) >= 2):

            text_val = values[0]
            text_val = text_val.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')
            questions.append(text_val)

questions = list(set(questions))

print(len(questions))

In [47]:
que_label = ['1' for i in questions]
df_wikiqa_que = pd.DataFrame(list(zip(questions, que_label)), columns=['text', 'label'])
df_gt = df_gt.append(df_wikiqa_que, ignore_index=True)

label_counter = Counter(df_gt['label'])
print(label_counter)

Counter({'2': 12281, '1': 2537, '0': 735})


In [48]:
# Extracting statements
ip_file = '../data/input/WikiQACorpus/WikiQA-train.txt'
statements = []

with open(ip_file, 'r') as f:
    for line in f.readlines():            
        line = line.strip()
        values = line.split('\t')

        if (len(values) >= 2):

            text_val = values[1]
            text_val = text_val.replace('\t', '').replace('\n', '').replace('  ', ' ').replace('""','')
            statements.append(text_val)

statements = list(set(statements))

print(len(statements))

18821


In [49]:
statements

['It has been economically infeasible for many years , but with rising oil prices , more companies are investing in this area .',
 'This is a complete list of congressional districts for representation in the United States House of Representatives .',
 'This is the measure used in most parts of the world .',
 'In 2003 , Rolling Stone Magazine listed his box set All Killer , No Filler : The Anthology number 242 on their list of `` 500 greatest albums of all time `` .',
 "On that day , at St. John 's Church in Richmond , Virginia , and is credited with having swung the balance in convincing the Virginia House of Burgesses to pass a resolution delivering the Virginia troops to the Revolutionary War .",
 'The relationship between percentage changes and basis points can be summarized as follows : 1 percentage point change = 100 basis points , and 0.01 percentage points = 1 basis point .',
 "With 11 national titles , UCLA has the record for the most NCAA Men 's Division I Basketball Champion

In [50]:
statement_label = ['0' for i in statements]
df_wikiqa_statement = pd.DataFrame(list(zip(statements, statement_label)), columns=['text', 'label'])
df_gt = df_gt.append(df_wikiqa_statement, ignore_index=True)

label_counter = Counter(df_gt['label'])
print(label_counter)

Counter({'0': 19556, '2': 12281, '1': 2537})


# Quora questions dataset

### Read question datasets

In [51]:
df_q1 = pd.read_csv('../data/input/quora_questions_train.csv')

In [52]:
df_q1.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [53]:
df_q1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404289 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [54]:
questions = df_q1['question1'][0:12000]
que_label = ['1' for i in questions]
df_quora_que = pd.DataFrame(list(zip(questions, que_label)), columns=['text', 'label'])

In [55]:
# Run this cell only if you want more than ~800k questions

# df_q2 = pd.read_csv('../data/input/quora_questions_test.csv')
# df_q2.head()

In [56]:
# df_q2.info()

## Combine original dataset with questions in df_gt

In [57]:
df_gt = df_gt.append(df_quora_que, ignore_index=True)

In [60]:
label_counter = Counter(df_gt['label'])
print(label_counter)

Counter({'0': 19556, '1': 14537, '2': 12281})


In [74]:
df_gt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46374 entries, 0 to 46373
Data columns (total 2 columns):
text     46374 non-null object
label    46374 non-null object
dtypes: object(2)
memory usage: 724.7+ KB


# Superficial preprocessing

In [66]:
import re

def preprocess_text(text):
    
    # print(text, end ='')
    
    text = text.lower()
    text = re.sub(r'\W',' ', text)
    text = re.sub(' \d+', ' ', text)
    text = re.sub(r'\s+',' ', text)
        
    #words = text.split(' ')
    #words = [w.strip() for w in words if w not in stopwords.words('english')]
    #text = ' '.join(words)
    text = text.strip()
        
    # print(text)
    return text

In [68]:
df_gt['text'] = df_gt['text'].apply(lambda x: preprocess_text(x))

## Metadata preprocessing

In [75]:
df_gt = df_gt.astype({"text": str, "label": int})

In [76]:
df_gt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46374 entries, 0 to 46373
Data columns (total 2 columns):
text     46374 non-null object
label    46374 non-null int64
dtypes: int64(1), object(1)
memory usage: 724.7+ KB


## Divide into train and test data

In [82]:
X = df_gt['text']
y = df_gt['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

df_train = pd.DataFrame(list(zip(X_train.values, y_train.values)), columns=['text', 'label'])
df_test = pd.DataFrame(list(zip(X_test.values, y_test.values)), columns=['text', 'label'])

In [84]:
label_counter = Counter(df_train['label'])
print(label_counter)

label_counter = Counter(df_test['label'])
print(label_counter)

Counter({0: 15645, 1: 11629, 2: 9825})
Counter({0: 3911, 1: 2908, 2: 2456})


# Write the groundtruth df_gt to .csv file

In [85]:
# Labels
# 0 --> Statements
# 1 --> Questions
# 2 --> Imperatives

op_file = '../data/input/groundtruth/speechact_gt.csv'
df_gt.to_csv(op_file, index=False, header=['text', 'label'])

op_file = '../data/input/groundtruth/speechact_train.csv'
df_train.to_csv(op_file, index=False, header=['text', 'label'])

op_file = '../data/input/groundtruth/speechact_test.csv'
df_test.to_csv(op_file, index=False, header=['text', 'label'])