## Data Exploration
This notebook loads the question, answer, and tag data from the Kaggle dataset and cleans its text fields so they can be explored.

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import html
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string


In [2]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so it is only
# evaluated as a string (and echoed as the cell output) and is never executed.
# The quoted code reads the Questions/Answers/Tags CSVs, collects the tags of each
# question Id into a list, inner-joins the answers on ParentId, renames the _x/_y
# merge suffixes to _question/_answer, and keeps seven columns.
"""questions = pd.read_csv("Dataset/Questions.csv",encoding='latin-1')
answers = pd.read_csv("Dataset/Answers.csv",encoding='latin-1')
tags = pd.read_csv("Dataset/Tags.csv",encoding='latin-1')
tag_question = tags.groupby('Id').agg(list).merge(questions,how='inner',on = "Id")
df = tag_question.merge(answers,how = "inner",left_on = "Id", right_on = "ParentId")
df.columns = df.columns.str.replace("_x","_question").str.replace("_y","_answer")
df = df[['Id','Tag','Score_question','Title','Body_question',"Score_answer","Body_answer"]]"""

'questions = pd.read_csv("Dataset/Questions.csv",encoding=\'latin-1\')\nanswers = pd.read_csv("Dataset/Answers.csv",encoding=\'latin-1\')\ntags = pd.read_csv("Dataset/Tags.csv",encoding=\'latin-1\')\ntag_question = tags.groupby(\'Id\').agg(list).merge(questions,how=\'inner\',on = "Id")\ndf = tag_question.merge(answers,how = "inner",left_on = "Id", right_on = "ParentId")\ndf.columns = df.columns.str.replace("_x","_question").str.replace("_y","_answer")\ndf = df[[\'Id\',\'Tag\',\'Score_question\',\'Title\',\'Body_question\',"Score_answer","Body_answer"]]'

In [3]:
# Disabled cell: wrapped in a triple-quoted string, so nothing here runs.
# The quoted code would take the first 10000 rows of df and write them to "Dataset/sample".
"""sample_df = df.head(10000)
sample_df.to_csv("Dataset/sample")"""

'sample_df = df.head(10000)\nsample_df.to_csv("Dataset/sample")'

In [4]:
# Lookup table of lowercase English contractions -> expanded form.
#
# Keys are matched literally, so the apostrophe character matters. The text-cleaning
# cells in this notebook convert the typographic apostrophe (’) to a straight one (')
# before contractions are expanded, which means a key written only with ’ can never
# match cleaned text. Every contraction therefore has a straight-apostrophe key; the
# typographic variants are kept so that un-normalized text is still handled.
contraction_map = {
    # Negative contractions
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "isn't": "is not",
    "mightn't": "might not",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "shouldn't": "should not",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",

    # Pronoun contractions
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",

    # Modal + "have" contractions (straight apostrophe; previously only the
    # typographic-apostrophe forms existed, so cleaned text was never expanded)
    "could've": "could have",
    "should've": "should have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",

    # Misc contractions
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "here's": "here is",
    "there's": "there is",
    "when's": "when is",
    "where's": "where is",
    "why's": "why is",
    "how's": "how is",
    "y'all": "you all",
    "o'clock": "of the clock",

    # Informal / common text contractions
    "ma'am": "madam",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",

    # Typographic-apostrophe (’) variants, for text that has not been normalized
    "ain’t": "am not",
    "y’all": "you all",
    "could’ve": "could have",
    "should’ve": "should have",
    "would’ve": "would have",
    "might’ve": "might have",
    "must’ve": "must have",
    "shan’t": "shall not",
    "let’s": "let us"
}


In [5]:
# Cache of compiled patterns, keyed by the tuple of contraction keys they match.
_CONTRACTION_PATTERNS = {}


def _contraction_pattern(keys):
    """Return a compiled regex matching any of ``keys`` as a whole word (cached)."""
    pattern = _CONTRACTION_PATTERNS.get(keys)
    if pattern is None:
        # Longest alternatives first so a longer key always wins over a shorter
        # key that is a prefix of it.
        alternatives = "|".join(
            re.escape(key) for key in sorted(keys, key=len, reverse=True)
        )
        # Lookarounds instead of \b so keys need not start/end with a word character.
        pattern = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")
        _CONTRACTION_PATTERNS[keys] = pattern
    return pattern


def expand_contractions(text, mapping=None):
    """Replace every contraction in ``text`` with its expanded form.

    Matching is case-sensitive and whole-word: a key is only replaced when it is
    not directly preceded or followed by a word character, so "wanna" is expanded
    but "wannabe" is left alone. All keys are handled in a single regex pass
    instead of one ``str.replace`` pass per key.

    Parameters
    ----------
    text : str
        Text to expand.
    mapping : dict[str, str], optional
        Contraction -> expansion table. Defaults to the module-level
        ``contraction_map``.

    Returns
    -------
    str
        ``text`` with contractions expanded; returned unchanged when ``text`` or
        ``mapping`` is empty.
    """
    if mapping is None:
        mapping = contraction_map
    if not text or not mapping:
        return text
    pattern = _contraction_pattern(tuple(mapping))
    return pattern.sub(lambda match: mapping[match.group(0)], text)

In [6]:
# Load the dataset from the CSV file "Dataset/total" into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="Dataset/total")

In [7]:
# Turn the HTML question bodies into lowercase, single-spaced plain text.
df['Body_question'] = (df['Body_question']
    # Strip markup BEFORE decoding entities. Decoding first turns escaped literal
    # text into fake tags (e.g. "a &lt;b and c&gt; d" -> "a <b and c> d") which the
    # tag pattern would then delete along with the text inside it.
    .str.replace(r'<[a-zA-Z/][^>]*>', '', regex=True)
    # na_action='ignore' leaves missing bodies as NaN instead of raising in html.unescape.
    .map(html.unescape, na_action='ignore')
    .str.replace('’', "'", regex=False)         # typographic -> straight apostrophe
    # Treat \r like \n and do it before collapsing spaces; removing \r afterwards
    # could leave double spaces ("a \r\n b") or fuse words ("a\rb").
    .str.replace(r'[\r\n]+', ' ', regex=True)
    .str.replace(r'  +', ' ', regex=True)       # collapse runs of spaces
    .str.lower()
    .str.strip())

In [8]:
# Turn the HTML answer bodies into lowercase, single-spaced plain text.
df['Body_answer'] = (df['Body_answer']
    # Strip markup BEFORE decoding entities. Decoding first turns escaped literal
    # text into fake tags (e.g. "a &lt;b and c&gt; d" -> "a <b and c> d") which the
    # tag pattern would then delete along with the text inside it.
    .str.replace(r'<[a-zA-Z/][^>]*>', '', regex=True)
    # na_action='ignore' leaves missing bodies as NaN instead of raising in html.unescape.
    .map(html.unescape, na_action='ignore')
    .str.replace('’', "'", regex=False)         # typographic -> straight apostrophe
    # Treat \r like \n and do it before collapsing spaces; removing \r afterwards
    # could leave double spaces ("a \r\n b") or fuse words ("a\rb").
    .str.replace(r'[\r\n]+', ' ', regex=True)
    .str.replace(r'  +', ' ', regex=True)       # collapse runs of spaces
    .str.lower()
    .str.strip())

In [9]:
# Normalize question titles: straight apostrophes, single spaces, lowercase.
# No HTML tag stripping or entity decoding is applied to titles here.
df['Title'] = (df['Title']
    .str.replace('’', "'", regex=False)         # typographic -> straight apostrophe
    # Treat \r like \n and do it before collapsing spaces; removing \r afterwards
    # could leave double spaces ("a \r\n b") or fuse words ("a\rb").
    .str.replace(r'[\r\n]+', ' ', regex=True)
    .str.replace(r'  +', ' ', regex=True)       # collapse runs of spaces
    .str.lower()
    .str.strip())

In [10]:
# NLTK's English stopword list, minus the negations "not"/"no"/"never" so that they
# survive stopword removal.
# NOTE(review): stopwords, word_tokenize and WordNetLemmatizer need NLTK data
# packages to be installed; no nltk.download(...) call is visible in this notebook.
stop_words = set(stopwords.words('english')) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Reduce free text to a space-separated string of lemmatized content words.

    Steps:
      1. Lowercase, so the lowercase contraction keys and stopwords can match.
      2. Expand contractions with ``expand_contractions``.
      3. Tokenize with ``word_tokenize``.
      4. Keep purely alphabetic tokens that are not in ``stop_words``.
      5. Lemmatize each remaining token (``WordNetLemmatizer`` default settings).

    Parameters
    ----------
    text : str
        Text to process. Non-string values (for example NaN from a missing
        DataFrame cell) produce an empty string instead of raising.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    expanded = expand_contractions(text.lower())
    tokens = word_tokenize(expanded)

    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

In [11]:
# Disabled cell: the calls below are wrapped in a triple-quoted string, so
# preprocess_text is NOT applied to df when this cell runs.
# NOTE(review): unless it is applied elsewhere, the DataFrame saved later in this
# notebook has not been through stopword removal or lemmatization — confirm this is intended.
"""df['Body_question'] = df['Body_question'].apply(preprocess_text)
df['Body_answer'] = df['Body_answer'].apply(preprocess_text)
df['Title'] = df['Title'].apply(preprocess_text)""" #Takes 20 min to run on full dataset

In [14]:
# Write the DataFrame to "Dataset/cleaned" as CSV. With to_csv's default options the
# index is written too, as the first column.
df.to_csv(path_or_buf='Dataset/cleaned')