In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
from collections import Counter
import os
import re
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
import sklearn
import nltk
from nltk.corpus import stopwords 

In [2]:
# Folder holding the ground-truth CSVs; outputs are written to the same folder.
INPUT_DIR = '../data/input/groundtruth/'
OUTPUT_DIR = '../data/input/groundtruth/'

# Sub-folders for pickled corpora/vectorizers and for the vectorized train sets.
CORPUS_DIR, VECTORIZED_DIR = (
    os.path.join(OUTPUT_DIR, sub_dir) for sub_dir in ('corpus', 'vectorized_trainset')
)

# Column names used for the text and label columns throughout the notebook.
text_col_header, label_col_header = 'text', 'label'

In [3]:
# Both CSVs are coerced with the same dtype map: text as str, label as int.
column_dtypes = {text_col_header: str, label_col_header: int}

ip_train_file = os.path.join(INPUT_DIR, 'speechact_train.csv')
df_train = pd.read_csv(ip_train_file).astype(column_dtypes)

ip_test_file = os.path.join(INPUT_DIR, 'speechact_test.csv')
df_test = pd.read_csv(ip_test_file).astype(column_dtypes)

In [4]:
# Start the augmented training set from a copy of the original training rows,
# with the text column first and the label column second.
# Copying df_train directly (rather than concatenating it onto an empty frame)
# keeps the column dtypes that were set when df_train was loaded and avoids
# pandas' deprecated handling of empty entries in concat.
df_train_augmented = df_train[[text_col_header, label_col_header]].copy()

print('Augmented dataframe', df_train_augmented.shape)
print(Counter(df_train_augmented[label_col_header]))

Augmented dataframe (37099, 2)
Counter({0: 15645, 1: 11629, 2: 9825})


#### Read augmented data files

In [5]:
# Class 0: load the GPT-2 generated samples from the '_statement' file.
class_label = 0
ip_aug_file = os.path.join(INPUT_DIR, 'gpt2/augmented_class'+str(class_label)+'_statement.csv')

df_aug_class = pd.read_csv(ip_aug_file, sep='\t')
print('Dataframe shape:',df_aug_class.shape)

# Keep only rows whose 'is_agumented' flag (spelled this way in the code) equals 1.
is_generated = df_aug_class['is_agumented'] == 1
df_aug_class = df_aug_class.loc[is_generated]
print('Only augmented dataframe shape:',df_aug_class.shape)

# De-duplicate the generated sentences (np.unique also returns them sorted).
text = np.unique(df_aug_class['text'].values)
print('Number of unique values:', len(text))

# Every surviving sentence receives this cell's class label.
labels = [class_label] * len(text)
print('Label size', len(labels))

df_aug_class = pd.DataFrame({text_col_header: text, label_col_header: labels})
print(df_aug_class.head())

# Append the new samples to the running augmented training set.
df_train_augmented = pd.concat([df_train_augmented, df_aug_class], axis=0, sort=False)
print('Augmented dataframe', df_train_augmented.shape)

print(Counter(df_train_augmented[label_col_header]))

Dataframe shape: (1331, 2)
Only augmented dataframe shape: (1210, 2)
Number of unique values: 1078
Label size 1078
                                                text  label
0  a carnivore meaning a flesh eater in carnivoro...      0
1  a carnivore meaning a flesh eater or eater of ...      0
2  a carnivore meaning a flesh eater or eatery in...      0
3  a carnivore meaning a flesh eater or eatery in...      0
4  a carnivore meaning a flesh eater or eatery in...      0
Augmented dataframe (38177, 2)
Counter({0: 16723, 1: 11629, 2: 9825})


In [6]:
# Class 1: load the GPT-2 generated samples from the '_interrogative' file.
class_label = 1
ip_aug_file = os.path.join(INPUT_DIR, 'gpt2/augmented_class'+str(class_label)+'_interrogative.csv')

df_aug_class = pd.read_csv(ip_aug_file, sep='\t')
print('Dataframe shape:',df_aug_class.shape)

# Keep only rows whose 'is_agumented' flag (spelled this way in the code) equals 1.
is_generated = df_aug_class['is_agumented'] == 1
df_aug_class = df_aug_class.loc[is_generated]
print('Only augmented dataframe shape:',df_aug_class.shape)

# De-duplicate the generated sentences (np.unique also returns them sorted).
text = np.unique(df_aug_class['text'].values)
print('Number of unique values:', len(text))

# Every surviving sentence receives this cell's class label.
labels = [class_label] * len(text)
print('Label size', len(labels))

df_aug_class = pd.DataFrame({text_col_header: text, label_col_header: labels})
print(df_aug_class.head())

# Append the new samples to the running augmented training set.
df_train_augmented = pd.concat([df_train_augmented, df_aug_class], axis=0, sort=False)
print('Augmented dataframe', df_train_augmented.shape)

print(Counter(df_train_augmented[label_col_header]))

Dataframe shape: (968, 2)
Only augmented dataframe shape: (880, 2)
Number of unique values: 718
Label size 718
                                text  label
0         alien abductees of chicago      1
1           alien abductees of china      1
2  alien abductees of chinese origin      1
3       alien abductees of hiroshima      1
4     alien abductees of kurukshetra      1
Augmented dataframe (38895, 2)
Counter({0: 16723, 1: 12347, 2: 9825})


In [7]:
# Class 2: load the GPT-2 generated samples from the '_imperative' file.
class_label = 2
ip_aug_file = os.path.join(INPUT_DIR, 'gpt2/augmented_class'+str(class_label)+'_imperative.csv')

df_aug_class = pd.read_csv(ip_aug_file, sep='\t')
print('Dataframe shape:',df_aug_class.shape)

# Keep only rows whose 'is_agumented' flag (spelled this way in the code) equals 1.
is_generated = df_aug_class['is_agumented'] == 1
df_aug_class = df_aug_class.loc[is_generated]
print('Only augmented dataframe shape:',df_aug_class.shape)

# De-duplicate the generated sentences (np.unique also returns them sorted).
text = np.unique(df_aug_class['text'].values)
print('Number of unique values:', len(text))

# Every surviving sentence receives this cell's class label.
labels = [class_label] * len(text)
print('Label size', len(labels))

df_aug_class = pd.DataFrame({text_col_header: text, label_col_header: labels})
print(df_aug_class.head())

# Append the new samples to the running augmented training set.
df_train_augmented = pd.concat([df_train_augmented, df_aug_class], axis=0, sort=False)
print('Augmented dataframe', df_train_augmented.shape)

print(Counter(df_train_augmented[label_col_header]))

Dataframe shape: (1474, 2)
Only augmented dataframe shape: (1340, 2)
Number of unique values: 874
Label size 874
                                                text  label
0  actually not clear on the exact content of the...      2
1  actually not clear that such an article would ...      2
2  actually not clear that the corresponding nota...      2
3  actually not clear that the coverage that we h...      2
4  actually not clear that the inclusion criteria...      2
Augmented dataframe (39769, 2)
Counter({0: 16723, 1: 12347, 2: 10699})


In [8]:
# Path of the CSV that stores the augmented training set (written and re-read below).
op_file = os.path.join(OUTPUT_DIR,'speechact_augmented_gpt2_train.csv')

In [9]:
# Persist the augmented training set: no index column, and the two header names
# are written as the column labels of the file.
df_train_augmented.to_csv(op_file, index=False, header=[text_col_header, label_col_header])

## If the augmented training text file is already available, run from the cells below

In [10]:
# Reload the augmented training set from op_file, so the notebook can resume from
# here when that file already exists (op_file must still be defined first).
df_train_augmented = pd.read_csv(op_file)

In [11]:
# Shape of the original (non-augmented) training frame, shown for comparison.
df_train.shape

(37099, 2)

In [12]:
# From here on df_train refers to a copy of the augmented training set;
# the originally loaded training frame is no longer reachable under this name.
df_train = df_train_augmented.copy()

In [13]:
# Shape of df_train after it has been rebound to the augmented data.
df_train.shape

(39769, 2)

## Preprocess data

In [14]:
_STOPWORD_CACHE = {}


def _get_stopword_set(language='english'):
    """Return the NLTK stopword list for ``language`` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        # Load the list once and keep it as a set, so the per-token membership
        # test below does not request the list again for every word.
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def preprocess_text(text, remove_stopwords=False):
    """Normalize a piece of text for vectorization.

    Steps, in order: cast to ``str``, lowercase, replace every non-word
    character with a space, drop digit runs that follow a space, collapse
    whitespace runs to a single space, optionally drop English stopwords,
    and strip leading/trailing whitespace.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()`` first, so ``None``
        becomes ``'none'``.
    remove_stopwords : bool, default False
        When True, tokens found in NLTK's English stopword list are removed.

    Returns
    -------
    str
        The cleaned text; may be the empty string.
    """
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    # Only digit runs preceded by a space are removed; digits at the very start
    # of the text or glued to letters (e.g. 'abc123') are kept.
    text = re.sub(r' \d+', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    if remove_stopwords:
        stop_set = _get_stopword_set('english')
        kept_words = [w.strip() for w in text.split(' ') if w not in stop_set]
        text = ' '.join(kept_words)

    return text.strip()

In [15]:
def remove_words_based_on_occurences(text_list, min_occurences=1):
    """Return the words that occur at least ``min_occurences`` times.

    Words are obtained by whitespace-splitting each text separately and the
    counts are accumulated across all texts, so a word at the end of one text
    is never merged with the first word of the next.

    Parameters
    ----------
    text_list : iterable of str
        The texts to count words in.
    min_occurences : int, default 1
        Minimum total count a word needs in order to be kept.

    Returns
    -------
    list of str
        The kept words, in order of first appearance.
    """
    word_freq = Counter()
    for text in text_list:
        # Count per text instead of building one giant concatenated string.
        word_freq.update(text.split())

    return [word for word, freq in word_freq.items() if freq >= min_occurences]

In [16]:
def preprocess_df(df_data, text_col_header, remove_stopwords):
    """Clean the text column of ``df_data`` in place and drop rows left empty.

    Each value of ``df_data[text_col_header]`` is passed through
    ``preprocess_text`` with the given ``remove_stopwords`` flag; rows whose
    cleaned text is the empty string are then dropped. The same dataframe
    object that was passed in is modified and returned.
    """
    def _clean(value):
        return preprocess_text(value, remove_stopwords)

    df_data[text_col_header] = df_data[text_col_header].apply(_clean)

    # Index labels of the rows whose text became empty after cleaning.
    empty_row_labels = df_data.index[df_data[text_col_header] == '']
    df_data.drop(empty_row_labels, inplace=True)

    return df_data

### Creating multiple corpora

In [17]:
# Corpus variant that keeps stopwords.
remove_stopwords = False
text_col_header = 'text'

# Clean a copy so that df_train itself is not modified by preprocess_df.
df_train_stopword = preprocess_df(df_train.copy(), text_col_header, remove_stopwords)

# df_test_stopword = df_test.copy()
# df_test_stopword = preprocess_df(df_test_stopword, text_col_header, remove_stopwords)

corpus_stopword = df_train_stopword[text_col_header].values
print('Corpus Length ', len(corpus_stopword))

# Persist the cleaned corpus.
corpus_file = os.path.join(CORPUS_DIR, 'aug_gpt2_corpus_stopword.pkl')
with open(corpus_file, 'wb') as f_op:
    pkl.dump(corpus_stopword, f_op)

Corpus Length  39769


In [18]:
# Corpus variant with English stopwords removed.
remove_stopwords = True
text_col_header = 'text'

# Clean a copy so that df_train itself is not modified by preprocess_df.
df_train_nostopword = preprocess_df(df_train.copy(), text_col_header, remove_stopwords)

# df_test_nostopword = df_test.copy()
# df_test_nostopword = preprocess_df(df_test_nostopword, text_col_header, remove_stopwords)

corpus_nostopword = df_train_nostopword[text_col_header].values
print('Corpus Length ', len(corpus_nostopword))

# Persist the cleaned corpus.
corpus_file = os.path.join(CORPUS_DIR, 'aug_gpt2_corpus_nostopword.pkl')
with open(corpus_file, 'wb') as f_op:
    pkl.dump(corpus_nostopword, f_op)

Corpus Length  39688


### Create an optional corpus containing only words that meet a minimum occurrence count

In [19]:
# Word list built from the no-stopword corpus: keep words seen at least twice.
corpus_min_occurrences = remove_words_based_on_occurences(
    corpus_nostopword,
    min_occurences=2,
)

# Persist the word list.
corpus_file = os.path.join(CORPUS_DIR, 'aug_gpt2_corpus_min_occurrences_2.pkl')
with open(corpus_file, 'wb') as f_op:
    pkl.dump(corpus_min_occurrences, f_op)

# Vectorization - Word embedding

## There are 3 corpora to be vectorized, using the following methods
<ul>
  <li> Bag of words </li>
  <li> TF-IDF </li>
  <li> Word embeddings - GloVe </li>
  <li> BERT embeddings </li>
</ul>

### Count Vectorizer

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

#### Corpus - with stopwords

In [21]:
# Learn the bag-of-words vocabulary from the corpus that keeps stopwords.
vectorizer = CountVectorizer().fit(corpus_stopword)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'aug_gpt2_vector_countvector_stopword.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

# Encode the training text and shape the labels as a single column.
data_train = vectorizer.transform(df_train_stopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_stopword[label_col_header]).reshape(-1, 1)
print('Shape of the label train:',label_train.shape)

# Persist the feature matrix and its labels as two separate pickles.
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_data_countvector_stopword.pkl')
vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_label_countvector_stopword.pkl')
for pkl_path, payload in ((vectorized_train_data_file, data_train),
                          (vectorized_train_label_file, label_train)):
    with open(pkl_path, 'wb') as f_op:
        pkl.dump(payload, f_op)

# data_test = vectorizer.transform(df_test_stopword[text_col_header])
# print('Shape of the data train:',data_test.shape)
# label_test = np.array(df_test_stopword[label_col_header])
# label_test = label_test.reshape((len(label_test), 1))

Shape of the data train: (39769, 34654)
Shape of the label train: (39769, 1)


#### Corpus - without stopwords

In [None]:
# Learn the bag-of-words vocabulary from the corpus with stopwords removed.
vectorizer = CountVectorizer()
vectorizer.fit(corpus_nostopword)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'aug_gpt2_vector_countvector_nostopword.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

# Features and labels both come from df_train_nostopword, the frame that
# corpus_nostopword was taken from, so the rows of the pickled matrix and the
# pickled labels match the corpus the vectorizer was fitted on.
data_train = vectorizer.transform(df_train_nostopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_nostopword[label_col_header])
label_train = label_train.reshape((len(label_train), 1))
print('Shape of the label train:',label_train.shape)

vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_data_countvector_nostopword.pkl')
with open(vectorized_train_data_file, 'wb') as f_op:
    pkl.dump(data_train, f_op)

vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_label_countvector_nostopword.pkl')
with open(vectorized_train_label_file, 'wb') as f_op:
    pkl.dump(label_train, f_op)

#### Corpus - minimum occurrence words

In [None]:
# Learn the bag-of-words vocabulary from the minimum-occurrence word list.
vectorizer = CountVectorizer().fit(corpus_min_occurrences)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'aug_gpt2_vector_countvector_minoccurences_2.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

# NOTE(review): the word list was derived from corpus_nostopword, while the text
# encoded here is df_train_stopword - confirm this pairing is intended.
data_train = vectorizer.transform(df_train_stopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_stopword[label_col_header]).reshape(-1, 1)
print('Shape of the label train:',label_train.shape)

# Persist the feature matrix and its labels as two separate pickles.
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_data_countvector_minoccurences.pkl')
vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_label_countvector_minoccurences.pkl')
for pkl_path, payload in ((vectorized_train_data_file, data_train),
                          (vectorized_train_label_file, label_train)):
    with open(pkl_path, 'wb') as f_op:
        pkl.dump(payload, f_op)

### Tf-Idf Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Corpus - with stopwords

In [None]:
# Fit TF-IDF weights on the corpus that keeps stopwords.
vectorizer = TfidfVectorizer().fit(corpus_stopword)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'aug_gpt2_vector_tfidfvector_stopword.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

# Encode the training text and shape the labels as a single column.
data_train = vectorizer.transform(df_train_stopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_stopword[label_col_header]).reshape(-1, 1)
print('Shape of the label train:',label_train.shape)

# Persist the feature matrix and its labels as two separate pickles.
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_data_tfidfvector_stopword.pkl')
vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_label_tfidfvector_stopword.pkl')
for pkl_path, payload in ((vectorized_train_data_file, data_train),
                          (vectorized_train_label_file, label_train)):
    with open(pkl_path, 'wb') as f_op:
        pkl.dump(payload, f_op)

#### Corpus - without stopwords

In [None]:
# Fit TF-IDF weights on the corpus with stopwords removed.
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus_nostopword)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'aug_gpt2_vector_tfidfvector_nostopword.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

# Features and labels both come from df_train_nostopword, the frame that
# corpus_nostopword was taken from, so the rows of the pickled matrix and the
# pickled labels match the corpus the vectorizer was fitted on.
data_train = vectorizer.transform(df_train_nostopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_nostopword[label_col_header])
label_train = label_train.reshape((len(label_train), 1))
print('Shape of the label train:',label_train.shape)

vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_data_tfidfvector_nostopword.pkl')
with open(vectorized_train_data_file, 'wb') as f_op:
    pkl.dump(data_train, f_op)

vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_label_tfidfvector_nostopword.pkl')
with open(vectorized_train_label_file, 'wb') as f_op:
    pkl.dump(label_train, f_op)

#### Corpus - minimum occurrence words

In [None]:
# Fit the TF-IDF vectorizer on the minimum-occurrence word list.
# NOTE(review): corpus_min_occurrences is a list of individual words, so each
# "document" seen by fit() is a single word - presumably only the vocabulary is
# meant to be learned here; IDF weights fitted this way may not reflect the
# training text. TODO confirm.
vectorizer = TfidfVectorizer().fit(corpus_min_occurrences)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'aug_gpt2_vector_tfidfvector_minoccurences_2.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

# NOTE(review): the word list was derived from corpus_nostopword, while the text
# encoded here is df_train_stopword - confirm this pairing is intended.
data_train = vectorizer.transform(df_train_stopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_stopword[label_col_header]).reshape(-1, 1)
print('Shape of the label train:',label_train.shape)

# Persist the feature matrix and its labels as two separate pickles.
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_data_tfidfvector_minoccurences.pkl')
vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'aug_gpt2_train_label_tfidfvector_minoccurences.pkl')
for pkl_path, payload in ((vectorized_train_data_file, data_train),
                          (vectorized_train_label_file, label_train)):
    with open(pkl_path, 'wb') as f_op:
        pkl.dump(payload, f_op)