## Data Exploration
This notebook explores the Kaggle dataset: it loads the questions, answers, and tags, merges them into a single table, and cleans the text.

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import html
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string
import functions #personal functions


In [2]:
# Load the three CSV files of the dataset (all read as latin-1).
questions = pd.read_csv("Dataset/Questions.csv", encoding='latin-1')
answers = pd.read_csv("Dataset/Answers.csv", encoding='latin-1')
tags = pd.read_csv("Dataset/Tags.csv", encoding='latin-1')

# Collect every tag of a question into one list, then attach the question data.
tag_question = tags.groupby('Id').agg(list).merge(questions, how='inner', on="Id")

# Attach the answers: one output row per (question, answer) pair.
# Only the answer columns that are actually used are merged in. This keeps any
# other answers column (e.g. an `Id` of its own) from colliding with the
# question columns, so the question `Id` keeps its plain name. The columns that
# do collide (Score, Body) are named directly through `suffixes` instead of
# rewriting "_x"/"_y" substrings in every column name afterwards.
df = tag_question.merge(
    answers[['ParentId', 'Score', 'Body']],
    how="inner",
    left_on="Id",
    right_on="ParentId",
    suffixes=("_question", "_answer"),
)
df = df[['Id','Tag','Score_question','Title','Body_question',"Score_answer","Body_answer"]]

The cell above reads all three files and merges them into a single DataFrame with one row per (question, answer) pair. An example question is shown below.

In [10]:
# Spot-check the merge on a single question: after indexing by Id, selecting
# one Id shows the rows that belong to that question.
EXAMPLE_QUESTION_ID = 308999
df.set_index("Id").loc[EXAMPLE_QUESTION_ID]

Unnamed: 0_level_0,Tag,Score_question,Title,Body_question,Score_answer,Body_answer
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
308999,"[python, decorator, wraps]",265,what does functools.wraps do?,in a comment on the answer to another question...,467,"When you use a decorator, you're replacing one..."
308999,"[python, decorator, wraps]",265,what does functools.wraps do?,in a comment on the answer to another question...,11,"I very often use classes, rather than function..."


In [6]:
# Clean the question bodies.
# Order matters: HTML tags are stripped while entities are still escaped, and
# only afterwards are the entities decoded. Decoding first would turn escaped
# text such as "&lt;module&gt;" into "<module>", which the tag regex would then
# delete as if it were markup.
# Carriage returns are dropped before whitespace is collapsed so that CRLF
# blank lines ("\r\n\r\n") do not leave double spaces behind.
df['Body_question'] = (df['Body_question']
    .str.replace(r'<[a-zA-Z/][^>]*>', '', regex=True)  #remove HTML tags
    .apply(html.unescape)                              #decode entities (&lt; &amp; ...)
    .str.replace('\r', '', regex=False)                #drop carriage returns
    .str.replace(r'\n+', ' ', regex=True)              #collapse newlines
    .str.replace(r'  +', ' ', regex=True)              #collapse multiple spaces
    .str.replace('’', "'", regex=False)                #curly apostrophe -> straight
    .str.lower()
    .str.strip())

In [7]:
# Clean the answer bodies: strip the HTML tags first, then decode the entities.
# Decoding first would turn escaped text such as "&lt;module&gt;" into
# "<module>", which the tag regex would then delete as if it were markup.
df['Body_answer'] = (df['Body_answer']
    .str.replace(r'<[a-zA-Z/][^>]*>', '', regex=True)  #remove HTML tags
    .apply(html.unescape)                              #decode entities (&lt; &amp; ...)
)

In [9]:
def _tidy_titles(titles):
    """Return the titles on one line, lower-cased, with straight apostrophes
    and no leading/trailing whitespace."""
    flattened = titles.str.replace(r'\n+', ' ', regex=True)     # newline runs -> one space
    flattened = flattened.str.replace(r'  +', ' ', regex=True)  # space runs -> one space
    flattened = flattened.str.replace('\r','')                  # drop carriage returns
    lowered = flattened.str.lower()
    straightened = lowered.str.replace('’',"'")                 # curly apostrophe -> straight
    return straightened.str.strip()


df['Title'] = _tidy_titles(df['Title'])

In [11]:
# English stop words, except the negations "not", "no" and "never", which are
# kept in the text (presumably so that negated statements keep their meaning).
stop_words = set(stopwords.words('english')) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one piece of text into a space-separated string of lemmas.

    Steps, in order: expand contractions (project helper
    ``functions.expand_contractions``), tokenize, lower-case, keep only purely
    alphabetic tokens, drop stop words (see ``stop_words``), lemmatize.

    Parameters
    ----------
    text : str
        Raw text. Anything that is not a string (e.g. the NaN pandas uses for
        a missing value) is treated as empty text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; "" if nothing survives.
    """
    # Missing values carry no text; return an empty result instead of failing
    # inside the tokenizer.
    if not isinstance(text, str):
        return ""

    # Expand contractions
    text = functions.expand_contractions(text)

    # Tokenization
    tokens = word_tokenize(text)

    # Lowercase and keep only alphabetic words. Lower-casing here (rather than
    # relying on the caller) is what lets capitalised stop words match the
    # all-lowercase stop-word set below.
    tokens = [word.lower() for word in tokens if word.isalpha()]

    # Remove stopwords
    tokens = [w for w in tokens if w not in stop_words]

    # Lemmatize (no part-of-speech tag is passed, so the lemmatizer's default applies)
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

In [11]:
# Apply the (slow) text preprocessing to the question body and title.
# df has one row per (question, answer) pair, so the same question text appears
# once for every answer it received. Each distinct text is therefore processed
# only once and the result is mapped back onto all of its rows; the output is
# the same as calling preprocess_text on every row, assuming preprocess_text
# depends only on its input.
def _apply_to_unique(column, func):
    """Apply `func` once per distinct value of `column` and map the results back."""
    distinct = column.drop_duplicates()
    lookup = dict(zip(distinct, distinct.map(func)))
    return column.map(lookup)


df['Body_question'] = _apply_to_unique(df['Body_question'], preprocess_text)
df['Title'] = _apply_to_unique(df['Title'], preprocess_text)

KeyboardInterrupt: 

In [17]:
# Build the full question text: the title, one space, then the body.
title_text = df['Title']
body_text = df['Body_question']
df['question'] = title_text.add(" ").add(body_text)

In [21]:
# Keep only the relevant columns; Title and Body_question are dropped here
# because their content now lives in the combined 'question' column.
KEEP_COLUMNS = ['Id', 'Tag', 'Score_question', 'question', 'Score_answer', 'Body_answer']
df = df.loc[:, KEEP_COLUMNS]

In [22]:
# Write the cleaned frame to disk without the row index.
# NOTE(review): 'Tag' holds Python lists (built with agg(list) when the tags
# were grouped), so each cell is written as the list's text form and will come
# back as a plain string when the CSV is reloaded - confirm consumers parse it.
CLEANED_CSV_PATH = 'Dataset/cleaned.csv'
df.to_csv(CLEANED_CSV_PATH, index=False)