In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import re
import tqdm
from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from scipy.sparse import save_npz, load_npz
import functions

In [10]:
# Diagnostic cell: load the cleaned dataset and display the number of missing
# values per column (the last expression is the cell's rendered output).
df = pd.read_csv('Dataset/cleaned.csv')
df.isna().sum()

Unnamed: 0          0
Id                  0
Tag                 0
Score_question      0
question          596
Score_answer        0
Body_answer         0
dtype: int64

In [4]:
# Load the cleaned dataset and discard every row containing a null. The nulls
# seem to come from saving the CSV and account for only ~600 of ~1 million rows.
df = pd.read_csv('Dataset/cleaned.csv').dropna()

# One row per distinct (question, Id) pair.
unique_df = df.loc[:, ['question', 'Id']].drop_duplicates()

# Lower-case each question and split it on whitespace into a list of words.
temp = unique_df['question'].str.lower().str.split()

In [6]:
# Document frequency: the number of questions in which each word occurs.
# Iterating over set(...) makes a word count at most once per question.
counts = defaultdict(int)
for tokens in temp:
    for token in set(tokens):
        counts[token] += 1

# Inverse document frequency: log(total questions / questions containing the word).
total_docs = len(temp)
idf = {token: np.log(total_docs / doc_freq) for token, doc_freq in counts.items()}

100%|██████████| 538903/538903 [00:07<00:00, 72589.27it/s]


Creates the IDF dictionary. Each question is converted to a set first, so a word that appears several times in the same question is only counted once.

In [7]:
# One {word: tf-idf weight} dict per question, in the same order as `temp`.
tf_idf_list = []
for tokens in temp:
    n_tokens = len(tokens)
    term_counts = Counter(tokens)
    # Term frequency (occurrences / question length) scaled by the word's idf.
    tf_idf_list.append(
        {term: (freq / n_tokens) * idf[term] for term, freq in term_counts.items()}
    )

100%|██████████| 538903/538903 [00:10<00:00, 51509.69it/s]


Creates a list of dicts, one dict per question. For each question we first count how often each word occurs and divide by the question length to get the term frequency, then multiply by the IDF of that word computed previously.

In [8]:
# Map every vocabulary word to its column index, assigned in sorted word order.
unique_words = dict(zip(sorted(idf), range(len(idf))))

In [9]:
# Gather the non-zero entries as parallel (row, column, value) lists:
# row = question position in tf_idf_list, column = word index from unique_words.
rows, cols, data = [], [], []
for doc_idx, weights in enumerate(tf_idf_list):
    for term, weight in weights.items():
        if term not in idf:
            continue  # skip words without an idf entry
        rows.append(doc_idx)
        cols.append(unique_words[term])
        data.append(weight)

# Assemble the CSR matrix: one row per question, one column per vocabulary word.
tf_idf_matrix = csr_matrix(
    (data, (rows, cols)),
    shape=(len(tf_idf_list), len(unique_words)),
)

Creates a sparse matrix representation of the tf_idf. This is important because a full matrix or pandas df takes up way too much memory.

In [14]:
# Persist the artifacts under Dataset/:
#   word_to_index.csv - each word and its column index in tf_idf_matrix
#   idf.csv           - each word and its idf score, rows sorted by word
#   sparse_matrix.npz - the sparse tf-idf matrix itself
# NOTE(review): with pandas' default NA handling, tokens such as "null", "nan"
# or "na" are parsed as NaN by pd.read_csv; the code that reloads these CSVs
# presumably needs keep_default_na=False (or na_filter=False) — confirm.
pd.DataFrame(list(unique_words.items()), columns=['word', 'index']).to_csv('Dataset/word_to_index.csv', index=False)
pd.DataFrame(sorted(idf.items()), columns=['word', 'idf_score']).to_csv('Dataset/idf.csv', index=False) 
save_npz('Dataset/sparse_matrix.npz', tf_idf_matrix)

Saves the IDF scores, the TF-IDF matrix, and the index of each unique word in the TF-IDF matrix. The IDF scores and the TF-IDF matrix are needed to process user queries later. The word-to-index mapping is important because dictionaries are not guaranteed to come back in the same order after being saved and reloaded; it tells us which column of the TF-IDF matrix corresponds to which word.