In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
from collections import Counter
import os
import re
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
import sklearn
import nltk
from nltk.corpus import stopwords 

In [2]:
INPUT_DIR = '../data/input/groundtruth/'
OUTPUT_DIR = '../data/input/groundtruth/'

CORPUS_DIR = os.path.join(OUTPUT_DIR, 'corpus')
VECTORIZED_DIR = os.path.join(OUTPUT_DIR, 'vectorized_trainset')

text_col_header = 'text'
label_col_header = 'label'

In [3]:
ip_train_file = os.path.join(INPUT_DIR, 'speechact_train.csv')
df_train = pd.read_csv(ip_train_file)
df_train = df_train.astype({text_col_header: str, label_col_header: int})

ip_test_file = os.path.join(INPUT_DIR, 'speechact_test.csv')
df_test = pd.read_csv(ip_test_file)
df_test = df_test.astype({text_col_header: str, label_col_header: int})

In [4]:
df_train.head()

Unnamed: 0,text,label
0,marketers representing industry raid manufactu...,0
1,rolling stone magazine said richards had creat...,0
2,most designs are inflated through pyrotechnic ...,0
3,please focus on the article s topic not the title,2
4,other types of arthropod produce silk most not...,0


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37099 entries, 0 to 37098
Data columns (total 2 columns):
text     37099 non-null object
label    37099 non-null int64
dtypes: int64(1), object(1)
memory usage: 579.8+ KB


In [6]:
df_test.head()

Unnamed: 0,text,label
0,there are many types of waxing suitable for re...,0
1,give the article a chance,2
2,gimme a couple hours and ill nom it myself if ...,2
3,in roman catholicism the baptism of jesus is o...,0
4,see notability,2


## Preprocess data

In [7]:
def preprocess_text(text, remove_stopwords=False):
    
    text = str(text)
    
    # print(text, len(text), end ='\n')
    text = text.lower()
    text = re.sub(r'\W',' ', text)
    text = re.sub(' \d+', ' ', text)
    text = re.sub(r'\s+',' ', text)
    
    if remove_stopwords:
        words = text.split(' ')
        words = [w.strip() for w in words if w not in stopwords.words('english')]
        text = ' '.join(words)
    text = text.strip()
        
    # print(text)
    return text

In [8]:
def remove_words_based_on_occurences(text_list, min_occurences=1):
    
    corpus = []
    
    alltext = ''
    for text in text_list:
        alltext += text
        
    wordlist = alltext.split()

    word_freq = dict(Counter(wordlist))
    # print(word_freq)
    
    for word,freq in word_freq.items():
        if (freq >= min_occurences):
            corpus.append(word)
    
    return corpus

In [9]:
def preprocess_df(df_data, text_col_header, remove_stopwords):
    
    df_data[text_col_header] = df_data[text_col_header].apply(lambda x: preprocess_text(x,remove_stopwords))
    df_data.drop(df_data[df_data[text_col_header] == ''].index, inplace=True)
    
    return df_data

### Creating multiple corpus

In [10]:
remove_stopwords = False
text_col_header = 'text'

df_train_stopword = df_train.copy()
df_train_stopword = preprocess_df(df_train_stopword, text_col_header, remove_stopwords)

# df_test_stopword = df_test.copy()
# df_test_stopword = preprocess_df(df_test_stopword, text_col_header, remove_stopwords)

corpus_stopword = df_train_stopword[text_col_header].values

print('Corpus Length ', len(corpus_stopword))

corpus_file = os.path.join(CORPUS_DIR, 'corpus_stopword.pkl')
with open(corpus_file, 'wb') as f_op:
    pkl.dump(corpus_stopword, f_op)

Corpus Length  37099


In [11]:
remove_stopwords = True
text_col_header = 'text'

df_train_nostopword = df_train.copy()
df_train_nostopword = preprocess_df(df_train_nostopword, text_col_header, remove_stopwords)

# df_test_nostopword = df_test.copy()
# df_test_nostopword = preprocess_df(df_test_nostopword, text_col_header, remove_stopwords)

corpus_nostopword = df_train_nostopword[text_col_header].values

print('Corpus Length ', len(corpus_nostopword))

corpus_file = os.path.join(CORPUS_DIR, 'corpus_nostopword.pkl')
with open(corpus_file, 'wb') as f_op:
    pkl.dump(corpus_nostopword, f_op)

Corpus Length  37020


### Create optional corpus with only minimum occurring words

In [12]:
corpus_min_occurrences = remove_words_based_on_occurences(corpus_nostopword, min_occurences=2)

corpus_file = os.path.join(CORPUS_DIR, 'corpus_min_occurrences_2.pkl')
with open(corpus_file, 'wb') as f_op:
    pkl.dump(corpus_min_occurrences, f_op)

# Vectorization - Word embedding

## There are 3 corpus that are to be vectorized
  <li> Bag of words </li>
  <li> Tf-IDF </li>
  <li> Word embeddings - Glove </li>
  <li> BERT embeddings </li>

### Count Vectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

#### Corpus - with stopwords

In [14]:
vectorizer = CountVectorizer()
vectorizer.fit(corpus_stopword)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_stopword.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

data_train = vectorizer.transform(df_train_stopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_stopword[label_col_header])
label_train = label_train.reshape((len(label_train), 1))
print('Shape of the label train:',label_train.shape)

vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_stopword.pkl')
with open(vectorized_train_data_file, 'wb') as f_op:
    pkl.dump(data_train, f_op)
    
vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_stopword.pkl')
with open(vectorized_train_label_file, 'wb') as f_op:
    pkl.dump(label_train, f_op)

# data_test = vectorizer.transform(df_test_stopword[text_col_header])
# print('Shape of the data train:',data_test.shape)
# label_test = np.array(df_test_stopword[label_col_header])
# label_test = label_test.reshape((len(label_test), 1))

Shape of the data train: (37099, 34154)
Shape of the label train: (37099, 1)


#### Corpus - without stopwords

In [15]:
vectorizer = CountVectorizer()
vectorizer.fit(corpus_nostopword)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_nostopword.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

data_train = vectorizer.transform(df_train_stopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_stopword[label_col_header])
label_train = label_train.reshape((len(label_train), 1))
print('Shape of the label train:',label_train.shape)

vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_nostopword.pkl')
with open(vectorized_train_data_file, 'wb') as f_op:
    pkl.dump(data_train, f_op)
    
vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_nostopword.pkl')
with open(vectorized_train_label_file, 'wb') as f_op:
    pkl.dump(label_train, f_op)

Shape of the data train: (37099, 34014)
Shape of the label train: (37099, 1)


#### Corpus - minimum occurrence words

In [16]:
vectorizer = CountVectorizer()
vectorizer.fit(corpus_min_occurrences)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_minoccurences_2.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

data_train = vectorizer.transform(df_train_stopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_stopword[label_col_header])
label_train = label_train.reshape((len(label_train), 1))
print('Shape of the label train:',label_train.shape)

vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_minoccurences.pkl')
with open(vectorized_train_data_file, 'wb') as f_op:
    pkl.dump(data_train, f_op)
    
vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_minoccurences.pkl')
with open(vectorized_train_label_file, 'wb') as f_op:
    pkl.dump(label_train, f_op)

Shape of the data train: (37099, 16518)
Shape of the label train: (37099, 1)


### Tf-Idf Vectorizer

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Corpus - with stopwords

In [20]:
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus_stopword)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'vector_tfidfvector_stopword.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

data_train = vectorizer.transform(df_train_stopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_stopword[label_col_header])
label_train = label_train.reshape((len(label_train), 1))
print('Shape of the label train:',label_train.shape)

vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_tfidfvector_stopword.pkl')
with open(vectorized_train_data_file, 'wb') as f_op:
    pkl.dump(data_train, f_op)
    
vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_tfidfvector_stopword.pkl')
with open(vectorized_train_label_file, 'wb') as f_op:
    pkl.dump(label_train, f_op)

Shape of the data train: (37099, 34154)
Shape of the label train: (37099, 1)


#### Corpus - without stopwords

In [21]:
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus_nostopword)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'vector_tfidfvector_nostopword.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

data_train = vectorizer.transform(df_train_stopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_stopword[label_col_header])
label_train = label_train.reshape((len(label_train), 1))
print('Shape of the label train:',label_train.shape)

vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_tfidfvector_nostopword.pkl')
with open(vectorized_train_data_file, 'wb') as f_op:
    pkl.dump(data_train, f_op)
    
vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_tfidfvector_nostopword.pkl')
with open(vectorized_train_label_file, 'wb') as f_op:
    pkl.dump(label_train, f_op)

Shape of the data train: (37099, 34014)
Shape of the label train: (37099, 1)


#### Corpus - minimum occurrence words

In [22]:
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus_min_occurrences)

# Write the vectorizer itself
vector_file = os.path.join(CORPUS_DIR, 'vector_tfidfvector_minoccurences_2.pkl')
with open(vector_file, 'wb') as f_op:
    pkl.dump(vectorizer, f_op)

data_train = vectorizer.transform(df_train_stopword[text_col_header])
print('Shape of the data train:',data_train.shape)
label_train = np.array(df_train_stopword[label_col_header])
label_train = label_train.reshape((len(label_train), 1))
print('Shape of the label train:',label_train.shape)

vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_tfidfvector_minoccurences.pkl')
with open(vectorized_train_data_file, 'wb') as f_op:
    pkl.dump(data_train, f_op)
    
vectorized_train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_tfidfvector_minoccurences.pkl')
with open(vectorized_train_label_file, 'wb') as f_op:
    pkl.dump(label_train, f_op)

Shape of the data train: (37099, 16518)
Shape of the label train: (37099, 1)
