In [1]:
import nltk

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vladislavkruglikov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vladislavkruglikov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd

df = pd.read_csv('data/Questions.csv', encoding="ISO-8859-1",
                 usecols=['Id', 'Title', 'Body', 'CreationDate', 'Score'])

df

Unnamed: 0,Id,CreationDate,Score,Title,Body
0,80,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2008-08-01T18:42:19Z,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
...,...,...,...,...,...
9995,637840,2009-03-12T09:17:44Z,4,geographic location uri scheme,<p>I'd like to use a URI scheme to enable the ...
9996,637870,2009-03-12T09:32:13Z,6,how to send signal from one program to another?,<p>i am using message queue as an ipc between ...
9997,637900,2009-03-12T09:43:34Z,1,How to conditionally compile VC6 resources,<p>depending on a compile switch (values are ...
9998,637910,2009-03-12T09:44:57Z,0,Getprivateprofilestring Bug,<p>I encrypted some text and put it in a INI f...


## Remove stopwords

Function to remove stopwords

In [3]:
from nltk import word_tokenize

nltk.download('stopwords')
nltk.download("punkt")

stopwords = set(nltk.corpus.stopwords.words('english'))

def remove_stop_words(text):
    text = text.lower()
    text_tokens = word_tokenize(text)
    tokens_without_sw = [w for w in text_tokens if not w in stopwords and w.isalnum()]
    return ' '.join(tokens_without_sw)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vladislavkruglikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vladislavkruglikov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Lemmatize

Function to lemmatize text

In [4]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    text = text.lower()
    text_tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in text_tokens]
    return ' '.join(lemmatized_tokens)

## Title preprocessing

Remoing stop words, and apply lemmatization

In [5]:
title = df['Title']

df['Title preprocessed'] = title.apply(remove_stop_words).apply(lemmatize)

## Body preprocessing

Since ```Body``` contains many code fragments and we don't want to use code fragments in search we need to get rid of it

In [6]:
import re

body = df['Body']

def remove_spaces(text):
    return ' '.join(text.split())

def remove_code(text):
    # remove code part
    text = re.sub('<pre>.*<\/pre>', '', text)
    # remove tags
    text = re.sub('<a.*?>', '', text)
    text = re.sub('</a>', '', text)
    text = re.sub('<code>', '', text)
    text = re.sub('</code>', '', text)
    text = re.sub('<p>', '', text)
    text = re.sub('</p>', '', text)
    return text

df['Body preprocessed'] = body.apply(remove_spaces).apply(remove_code).apply(remove_stop_words).apply(lemmatize)

## Fit TF-IDF title vectorizer

Fit TF-IDF vectorizer for title column

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

tfidf_title_vectorizer = TfidfVectorizer()

title_preprocessed = df['Title preprocessed']

tfidf_title_vectorizer.fit(title_preprocessed)

pickle.dump(tfidf_title_vectorizer, open('data/tfidf_title_vectorizer.sav', 'wb'))

## Extract keywords from TF-IDF title vectorizer

Get top 10 words with highest TF-IDF from each title

In [8]:
def get_top_tfidf_words(sentence, tfidf_vectorizer, n=5):
    terms = tfidf_vectorizer.get_feature_names()
    sums = tfidf_vectorizer.transform([sentence]).sum(axis=0)
    data = []
    for col, term in enumerate(terms):
        if sums[0, col] > 0:
            data.append((term, sums[0, col]))
    data.sort(key=lambda x: -x[1])
    return ' '.join([i[0] for i in data[:n]])

df['Title preprocessed top 10 keywords'] = title_preprocessed.apply(lambda text: get_top_tfidf_words(text, tfidf_title_vectorizer, n=5))

## Fit TF-IDF body vectorizer

Fit TF-IDF vectorizer for ```body``` column

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_body_vectorizer = TfidfVectorizer()

body_preprocessed = df['Body preprocessed']

tfidf_body_vectorizer.fit(body_preprocessed)

pickle.dump(tfidf_body_vectorizer, open('data/tfidf_body_vectorizer.sav', 'wb'))

## Extract keywords from TF-IDF body vectorizer

Get top 30 words with highest TF-IDF from each body

In [10]:
df['Body preprocessed top 30 keywords'] = body_preprocessed.apply(lambda text: get_top_tfidf_words(text, tfidf_body_vectorizer, n=30))

## Get date score

The newest questions have higher score

In [11]:
df['timestamp'] = pd.DatetimeIndex(df.CreationDate).asi8

date_min = df.timestamp.min()
date_max = df.timestamp.max()

def date_score(date):
    return 1 + (date - date_min) / (date_max - date_min)

df['Date score'] = df.timestamp.apply(date_score)

## Get Vote score

The questions with more votes are probably better

In [12]:
votes = df['Score']

votes_min = votes.min()
votes_max = votes.max()

def votes_score(votes):
    return 1 + (votes - votes_min) / (votes_max - votes_min)

df['Votes score'] = df['Score'].apply(votes_score)

## Calculate TF-IDF vectors for title preprocessed top 10 keywords

Will use this to calculate distance between query TF-IDF vector and database documents TF-IDF vectors to find the most similar

In [13]:
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

df['Title preprocessed top 10 keywords TF-IDF vector'] = df['Title preprocessed top 10 keywords'].apply(lambda x: tfidf_title_vectorizer.transform([x]))

def calculate_distance(vec1, vec2):
    cosine_similarities = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))
    return cosine_similarities[0][0]

In [14]:
# For example

query = "multiple query one sql"

calculate_distance(df['Title preprocessed top 10 keywords TF-IDF vector'][0], tfidf_title_vectorizer.transform([query]))

0.7426128307642093

## Calculate TF-IDF vectors for body preprocessed top 30 keywords

Will use this to calculate distance between query TF-IDF vector and database documents TF-IDF vectors to find the most similar

In [15]:
df['Body preprocessed top 30 keywords TF-IDF vector'] = df['Body preprocessed top 30 keywords'].apply(lambda x: tfidf_body_vectorizer.transform([x]))

## Build Inverted Index for title

Function for creating inverted index to allow us do the first stage of search

In [16]:
from collections import defaultdict

def build_inverted_index(data: list) -> list:
    inverted_index = defaultdict(list)
    rows = len(data)
    for idx in range(1, rows):
        sample = df.iloc[idx]
        union = sample['Title']
        for word in set(union.split()):
            inverted_index[word].append(sample.Id)
    return inverted_index

title_inverted_index = build_inverted_index(df['Title preprocessed top 10 keywords'])
pickle.dump(title_inverted_index, open('data/title_inverted_index.sav', 'wb'))

## Build Inverted Index for body

Create different inverted index for body

In [17]:
body_inverted_index = build_inverted_index(df['Body preprocessed top 30 keywords'])
pickle.dump(body_inverted_index, open('data/body_inverted_index.sav', 'wb'))

## Clear not needed cols

Save space by removing columns that we don't need

In [18]:
df.drop(columns=['CreationDate', 'timestamp', 'Score', 'Title preprocessed', 'Body preprocessed',
                 'Title preprocessed top 10 keywords', 'Body preprocessed top 30 keywords'], inplace=True)

## Look at the final dataset

Our dataset after all transformations

In [19]:
df

Unnamed: 0,Id,Title,Body,Date score,Votes score,Title preprocessed top 10 keywords TF-IDF vector,Body preprocessed top 30 keywords TF-IDF vector
0,80,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,1.000000,1.005967,"(0, 5686)\t0.5733686742526269\n (0, 4713)\t...","(0, 20409)\t0.1903564121651477\n (0, 19968)..."
1,90,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,1.000138,1.028681,"(0, 6246)\t0.43515936734463906\n (0, 6145)\...","(0, 18905)\t0.2798568366114675\n (0, 18636)..."
2,120,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,1.000352,1.005005,"(0, 5454)\t0.6839994312478623\n (0, 3517)\t...","(0, 20513)\t0.16898679138737768\n (0, 20337..."
3,180,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,1.000889,1.011165,"(0, 6644)\t0.6334621034829302\n (0, 2363)\t...","(0, 19968)\t0.12484057990033459\n (0, 18485..."
4,260,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,1.001761,1.010395,"(0, 5236)\t0.5805010341651897\n (0, 2365)\t...","(0, 19929)\t0.15686951029358692\n (0, 18692..."
...,...,...,...,...,...,...,...
9995,637840,geographic location uri scheme,<p>I'd like to use a URI scheme to enable the ...,1.999898,1.001732,"(0, 6411)\t0.48820269651697795\n (0, 5214)\...","(0, 20176)\t0.24971727900849272\n (0, 19476..."
9996,637870,how to send signal from one program to another?,<p>i am using message queue as an ipc between ...,1.999943,1.002117,"(0, 5420)\t0.5632876373150607\n (0, 5303)\t...","(0, 19928)\t0.10054315161189521\n (0, 19531..."
9997,637900,How to conditionally compile VC6 resources,<p>depending on a compile switch (values are ...,1.999979,1.001155,"(0, 6478)\t0.5681702643512242\n (0, 5014)\t...","(0, 20319)\t0.07966154121627272\n (0, 19740..."
9998,637910,Getprivateprofilestring Bug,<p>I encrypted some text and put it in a INI f...,1.999983,1.000962,"(0, 2433)\t0.8132815082660368\n (0, 734)\t0...","(0, 20408)\t0.14155610471070454\n (0, 18279..."


## Save preprocessed dataset

Save dataset to pickle file, it's better to use pickle because it allows us to save ```TF-IDF vector``` scipy format of column, because pandas will force it to be string

In [20]:
pickle.dump(df, open('data/Preprocessed Questions.sav', 'wb'))