In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import re
import tqdm
from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from scipy.sparse import save_npz, load_npz

In [2]:
# Lookup table: lower-case contraction -> expanded form.
#
# Each contraction is written once with the ASCII apostrophe ('); the
# typographic-apostrophe (’) twin of every such key is derived just below the
# literal, so both spellings are always covered consistently.
#
# Caveats of the chosen expansions (they are deliberate simplifications):
#   * "'s" forms ("he's", "it's", ...) always expand to "... is", never "... has".
#   * "'d" forms ("i'd", "he'd", ...) always expand to "... would", never "... had".
#   * "ain't" always expands to "am not".
contraction_map = {
    # Negative contractions
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "isn't": "is not",
    "mightn't": "might not",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "shouldn't": "should not",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",

    # Pronoun contractions
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",

    # Perfect-modal contractions (previously present only in their ’ spelling)
    "could've": "could have",
    "should've": "should have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",

    # Misc contractions
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "here's": "here is",
    "there's": "there is",
    "when's": "when is",
    "where's": "where is",
    "why's": "why is",
    "how's": "how is",
    "y'all": "you all",
    "o'clock": "of the clock",

    # Informal / common text contractions
    "ma'am": "madam",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",
}

# Add the typographic-apostrophe spelling of every apostrophe-bearing key.
# The comprehension is fully evaluated before update() mutates the dict.
contraction_map.update({
    key.replace("'", "’"): expansion
    for key, expansion in contraction_map.items()
    if "'" in key
})


In [3]:
def expand_contractions(text, mapping=None):
    """Expand contractions in ``text`` using whole-word matching.

    Compared with a plain chain of ``str.replace`` calls this:

    * only matches whole words, so ``"wannabe"`` is not turned into
      ``"want tobe"``;
    * matches case-insensitively (``"Don't"`` -> ``"Do not"``); when the
      matched text starts with an upper-case letter, the first letter of the
      expansion is upper-cased as well;
    * treats the ASCII apostrophe (') and the typographic one (’) as
      interchangeable, regardless of which spelling the mapping uses.

    Parameters
    ----------
    text : str
        Text to process.
    mapping : dict[str, str], optional
        Contraction -> expansion table. Defaults to the module-level
        ``contraction_map``.

    Returns
    -------
    str
        ``text`` with every recognised contraction replaced by its expansion.
        Text outside the matched contractions is left untouched.
    """
    if mapping is None:
        mapping = contraction_map
    if not mapping:
        # An empty alternation would match the empty string everywhere.
        return text

    # Canonical key form: lower-case with ASCII apostrophes.
    lookup = {
        key.lower().replace("’", "'"): expansion
        for key, expansion in mapping.items()
    }

    # Longest keys first so the alternation always prefers the longest match.
    # Each apostrophe position accepts either apostrophe character; the key is
    # split before escaping so this works however re.escape treats "'".
    alternatives = "|".join(
        "['’]".join(re.escape(part) for part in key.split("'"))
        for key in sorted(lookup, key=len, reverse=True)
    )
    pattern = re.compile(
        r"(?<!\w)(?:" + alternatives + r")(?!\w)", re.IGNORECASE
    )

    def _expand(match):
        found = match.group(0)
        expansion = lookup.get(found.lower().replace("’", "'"))
        if expansion is None:
            # Exotic case-folding (e.g. dotted capital I) can match without
            # lower-casing back to a known key; leave such text unchanged.
            return found
        if found[0].isupper():
            expansion = expansion[:1].upper() + expansion[1:]
        return expansion

    return pattern.sub(_expand, text)

In [4]:
#df = pd.read_csv('Dataset/cleaned.csv')

In [5]:
# Load the cleaned dataset and discard every row that has a missing value in
# any column.
df = pd.read_csv('Dataset/cleaned.csv').dropna()

# Keep one row per distinct (question, Id) pair.
unique_df = df.loc[:, ['question', "Id"]].drop_duplicates()

# Lower-case each question and split it on whitespace into a list of tokens.
temp = unique_df['question'].str.lower().str.split()

In [6]:
# Document frequency: for each token, the number of questions containing it.
counts = defaultdict(int)
for tokens in tqdm.tqdm(temp):
    # set() so that repeats inside a single question are counted once.
    for token in set(tokens):
        counts[token] += 1

# Inverse document frequency: idf(t) = ln(N / df(t)), N = number of questions.
# A token that occurs in every question therefore gets idf 0.
n_docs = len(temp)
idf = {token: np.log(n_docs / doc_freq) for token, doc_freq in counts.items()}

100%|██████████| 538903/538903 [00:07<00:00, 72589.27it/s]


In [7]:
# One {token: tf-idf weight} dict per question, in the iteration order of `temp`.
# Term frequency is the token's count divided by the question's token count.
tf_idf_list = []
for tokens in tqdm.tqdm(temp):
    n_tokens = len(tokens)
    tf_idf_list.append({
        token: (occurrences / n_tokens) * idf[token]
        for token, occurrences in Counter(tokens).items()
    })

100%|██████████| 538903/538903 [00:10<00:00, 51509.69it/s]


In [8]:
# Vocabulary: token -> column index, with tokens numbered in sorted order.
unique_words = dict(zip(sorted(idf), range(len(idf))))

In [9]:
# Collect the non-zero cells in coordinate form (row, column, value), then
# build the CSR matrix in one go: one row per question, one column per token.
rows, cols, data = [], [], []
for row_idx, weights in enumerate(tf_idf_list):
    # Only tokens that have an idf entry are emitted.
    cells = [
        (unique_words[token], weight)
        for token, weight in weights.items()
        if token in idf
    ]
    rows.extend([row_idx] * len(cells))
    cols.extend(col_idx for col_idx, _ in cells)
    data.extend(weight for _, weight in cells)

tf_idf_matrix = csr_matrix(
    (data, (rows, cols)),
    shape=(len(tf_idf_list), len(unique_words)),
)

In [14]:
# Persist the vocabulary (token -> matrix column index) as a two-column CSV.
# NOTE(review): tokens such as "null" or "nan" would be parsed as missing
# values by a default pd.read_csv; the consumer may need
# keep_default_na=False — confirm where this file is loaded.
vocab_table = pd.DataFrame(list(unique_words.items()), columns=['word', 'index'])
vocab_table.to_csv('Dataset/word_to_index.csv', index=False)

In [15]:
# Persist the idf score of every token, rows sorted by token.
# NOTE(review): tokens such as "null" or "nan" would be parsed as missing
# values by a default pd.read_csv; the consumer may need
# keep_default_na=False — confirm where this file is loaded.
idf_table = pd.DataFrame(sorted(idf.items()), columns=['word', 'idf_score'])
idf_table.to_csv('Dataset/idf.csv', index=False)

In [16]:
# Persist the sparse TF-IDF matrix in SciPy's .npz format (load with load_npz).
save_npz(file='Dataset/sparse_matrix.npz', matrix=tf_idf_matrix)