In [11]:
#import libraries

import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import random
import re
import string

In [1]:
#take a reproducible subsample of 600000 questions, since the file holds more than 1.2 million rows

total_rows = 1264216  #assumed to count data rows only, header excluded - TODO confirm against the file
sample_size = 600000
sample_seed = 42      #fixed so that Restart & Run All selects the same rows every time

#row 0 is the header line, so skip candidates start at 1; if row 0 were skipped,
#read_csv would take the first surviving data row as the column names
sampler = random.Random(sample_seed)
skiprows = sorted(sampler.sample(range(1, total_rows + 1), total_rows - sample_size))

#date columns are parsed through parse_dates, the documented route; a datetime64
#entry in dtype= is not supported for parsing
dtype = {'Id': pd.Int64Dtype(), 'OwnerUserId': pd.Int64Dtype(), 'Title': object, 'Body': object}
data = pd.read_csv(
    'Questions.csv',
    engine='python',
    dtype=dtype,
    parse_dates=['CreationDate', 'ClosedDate'],
    skiprows=skiprows,
)
data.head()

In [2]:
#load every tag row, then keep only the rows whose question Id is in the sampled questions

tags = pd.read_csv('Tags.csv')
in_sample = tags['Id'].isin(data['Id'])  #boolean mask, True where the tag belongs to a sampled question
tags = tags[in_sample]
tags.head()

In [6]:
#select features to use

#explicit copy: the cleaning cells below assign into X, and assigning into a
#bare slice of data triggers SettingWithCopyWarning
X = data[['Id', 'Title', 'Body']].copy()
X.head()

In [2]:
#strip html tags from question body using beautifulsoup

def _html_to_text(html):
    """Return the text content of an HTML fragment with all markup removed."""
    #name the parser explicitly: without it bs4 picks whichever parser happens
    #to be installed, so the extracted text can differ between machines
    return BeautifulSoup(html, 'html.parser').get_text()

#one pass over the column instead of a .loc read and write for every row
X['Body'] = X['Body'].map(_html_to_text)

In [2]:
#convert to lower case

#vectorised string accessor replaces the per-row .loc read/write loop;
#missing values pass through as missing instead of raising
for column in ('Title', 'Body'):
    X[column] = X[column].str.lower()

In [4]:
#remove numbers

#drop every run of digits; regex=True makes pandas treat the pattern as a
#regular expression, matching what re.sub did row by row
for column in ('Title', 'Body'):
    X[column] = X[column].str.replace(r'\d+', '', regex=True)

In [5]:
#remove punctuation

#build the deletion table once; the per-row version rebuilt it twice for every row
punctuation_table = str.maketrans('', '', string.punctuation)

for column in ('Title', 'Body'):
    X[column] = X[column].str.translate(punctuation_table)

In [2]:
#remove leading and trailing whitespace (whitespace inside the text is left as it is)

for column in ('Title', 'Body'):
    X[column] = X[column].str.strip()

In [2]:
#remove stop words

#NOTE(review): the stopword list and the tokenizer need their NLTK data packages
#to be installed already - confirm the environment provides them
stopwords = set(nltk.corpus.stopwords.words('english'))

def _drop_stopwords(text):
    """Tokenize text and re-join it with single spaces, leaving out English stop words."""
    return " ".join(token for token in nltk.tokenize.word_tokenize(text) if token not in stopwords)

#one pass per column instead of a .loc read and write for every row
for column in ('Title', 'Body'):
    X[column] = X[column].map(_drop_stopwords)

In [2]:
#perform stemming

stemmer = nltk.stem.SnowballStemmer('english')

def _stem_text(text):
    """Tokenize text, stem every token, and re-join the stems with single spaces."""
    return " ".join(stemmer.stem(token) for token in nltk.tokenize.word_tokenize(text))

#one pass per column instead of a .loc read and write for every row
for column in ('Title', 'Body'):
    X[column] = X[column].map(_stem_text)

In [96]:
#save the dataframes as feathers

data.to_feather('./data.ftr')
X.to_feather('./X.ftr')

#tags is a filtered slice, so its index has gaps and must be reset before it can
#be written as feather. Resetting in memory replaces the earlier write/re-read of
#'./tags.csv', which on a case-insensitive file system is the same file as the
#raw 'Tags.csv' input and would overwrite it with the filtered subset.
tags = tags.reset_index(drop=True)
tags.to_feather('./tags.ftr')