In [1]:
import nltk

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vladislavkruglikov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vladislavkruglikov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd

df = pd.read_csv('data/Questions.csv', encoding="ISO-8859-1", nrows=2000,
                 usecols=['Id', 'Title', 'Body', 'CreationDate', 'Score'])

df

Unnamed: 0,Id,CreationDate,Score,Title,Body
0,80,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2008-08-01T18:42:19Z,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
...,...,...,...,...,...
1995,162650,2008-10-02T14:38:16Z,4,"How do I detect ""Easter Egg"" mode in my Palm O...","<p>Since the early days, Palm OS has had a spe..."
1996,162680,2008-10-02T14:42:33Z,14,The value of hobby game development,<p>Does attempting to develop some sort of gam...
1997,162730,2008-10-02T14:49:46Z,6,Rendered pIxel width data for each character i...,<p>I have a table column that needs to be limi...
1998,162810,2008-10-02T15:00:21Z,26,How do you log the machine name via log4net?,<p>I am using Log4Net with the AdoNetAppender ...


## Remove stopwords

Function to remove stopwords

In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize

nltk.download('stopwords')
nltk.download("punkt")

stopwords = set(nltk.corpus.stopwords.words('english'))

def lower_remove_punctuation(text):
    text = text.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    text_tokens = tokenizer.tokenize(text)
    return ' '.join(text_tokens)

def remove_stop_words(text):
    text_tokens = word_tokenize(text)
    tokens_without_sw = [w for w in text_tokens if not w in stopwords and w.isalnum()]
    return ' '.join(tokens_without_sw)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vladislavkruglikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vladislavkruglikov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Lemmatize

Function to lemmatize text

In [4]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    text_tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in text_tokens]
    return ' '.join(lemmatized_tokens)

## Title preprocessing

Remoing stop words, and apply lemmatization

In [5]:
title = df['Title']

df['Title preprocessed'] = title.apply(lower_remove_punctuation).apply(remove_stop_words).apply(lemmatize)

## Body preprocessing

Since ```Body``` contains many code fragments and we don't want to use code fragments in search we need to get rid of it

In [6]:
import re

body = df['Body']

def remove_spaces(text):
    return ' '.join(text.split())

def remove_code(text):
    # remove code part
    text = re.sub('<pre>.*<\/pre>', '', text)
    # remove tags
    text = re.sub('<a.*?>', '', text)
    text = re.sub('</a>', '', text)
    text = re.sub('<code>', '', text)
    text = re.sub('</code>', '', text)
    text = re.sub('<p>', '', text)
    text = re.sub('</p>', '', text)
    return text

df['Body preprocessed'] = body.apply(lower_remove_punctuation).apply(remove_spaces).apply(remove_code).apply(remove_stop_words).apply(lemmatize)

## Fit TF-IDF title vectorizer

Fit TF-IDF vectorizer for title column

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

tfidf_title_vectorizer = TfidfVectorizer()

title_preprocessed = df['Title preprocessed']

tfidf_title_vectorizer.fit(title_preprocessed)

pickle.dump(tfidf_title_vectorizer, open('data/tfidf_title_vectorizer.sav', 'wb'))

## Extract keywords from TF-IDF title vectorizer

Get top 10 words with highest TF-IDF from each title

In [8]:
def get_top_tfidf_words(sentence, tfidf_vectorizer, n=5):
    terms = tfidf_vectorizer.get_feature_names()
    sums = tfidf_vectorizer.transform([sentence]).sum(axis=0)
    data = []
    for col, term in enumerate(terms):
        if sums[0, col] > 0:
            data.append((term, sums[0, col]))
    data.sort(key=lambda x: -x[1])
    return ' '.join([i[0] for i in data[:n]])

df['Title preprocessed top 10 keywords'] = title_preprocessed.apply(lambda text: get_top_tfidf_words(text, tfidf_title_vectorizer, n=5))

## Fit TF-IDF body vectorizer

Fit TF-IDF vectorizer for ```body``` column

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_body_vectorizer = TfidfVectorizer()

body_preprocessed = df['Body preprocessed']

tfidf_body_vectorizer.fit(body_preprocessed)

pickle.dump(tfidf_body_vectorizer, open('data/tfidf_body_vectorizer.sav', 'wb'))

## Extract keywords from TF-IDF body vectorizer

Get top 30 words with highest TF-IDF from each body

In [10]:
df['Body preprocessed top 30 keywords'] = body_preprocessed.apply(lambda text: get_top_tfidf_words(text, tfidf_body_vectorizer, n=30))

## Get date score

The newest questions have higher score

In [11]:
df['timestamp'] = pd.DatetimeIndex(df.CreationDate).asi8

date_min = df.timestamp.min()
date_max = df.timestamp.max()

def date_score(date):
    return 1 + (date - date_min) / (date_max - date_min)

df['Date score'] = df.timestamp.apply(date_score)

## Get Vote score

The questions with more votes are probably better

In [12]:
votes = df['Score']

votes_min = votes.min()
votes_max = votes.max()

def votes_score(votes):
    return 1 + (votes - votes_min) / (votes_max - votes_min)

df['Votes score'] = df['Score'].apply(votes_score)

## Calculate TF-IDF vectors for title preprocessed top 10 keywords

Will use this to calculate distance between query TF-IDF vector and database documents TF-IDF vectors to find the most similar

In [13]:
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

df['Title preprocessed top 10 keywords TF-IDF vector'] = df['Title preprocessed top 10 keywords'].apply(lambda x: tfidf_title_vectorizer.transform([x]))

def calculate_distance(vec1, vec2):
    cosine_similarities = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))
    return cosine_similarities[0][0]

In [14]:
# For example

query = "multiple query one sql"

calculate_distance(df['Title preprocessed top 10 keywords TF-IDF vector'][0], tfidf_title_vectorizer.transform([query]))

0.4012645297636559

## Calculate TF-IDF vectors for body preprocessed top 30 keywords

Will use this to calculate distance between query TF-IDF vector and database documents TF-IDF vectors to find the most similar

In [15]:
df['Body preprocessed top 30 keywords TF-IDF vector'] = df['Body preprocessed top 30 keywords'].apply(lambda x: tfidf_body_vectorizer.transform([x]))

## Build Inverted Index for title

Function for creating inverted index to allow us do the first stage of search

In [16]:
from collections import defaultdict

def build_inverted_index(data: list) -> list:
    inverted_index = defaultdict(list)
    rows = len(data)
    for idx in range(1, rows):
        sample = df.iloc[idx]
        for word in set(data[idx].split()):
            inverted_index[word].append(sample.Id)
    return inverted_index

title_inverted_index = build_inverted_index(df['Title preprocessed top 10 keywords'])
pickle.dump(title_inverted_index, open('data/title_inverted_index.sav', 'wb'))

## Build Inverted Index for body

Create different inverted index for body

In [17]:
body_inverted_index = build_inverted_index(df['Body preprocessed top 30 keywords'])
pickle.dump(body_inverted_index, open('data/body_inverted_index.sav', 'wb'))

## Clear not needed cols

Save space by removing columns that we don't need

In [18]:
df.drop(columns=['CreationDate', 'timestamp', 'Score', 'Title preprocessed', 'Body preprocessed',
                 'Title preprocessed top 10 keywords', 'Body preprocessed top 30 keywords'], inplace=True)

## Look at the final dataset

Our dataset after all transformations

In [19]:
df

Unnamed: 0,Id,Title,Body,Date score,Votes score,Title preprocessed top 10 keywords TF-IDF vector,Body preprocessed top 30 keywords TF-IDF vector
0,80,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,1.000000,1.008020,"(0, 2522)\t0.44092916579466235\n (0, 2502)\...","(0, 12280)\t0.14965085848996112\n (0, 11933..."
1,90,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,1.000496,1.040653,"(0, 2766)\t0.44673955305001606\n (0, 2718)\...","(0, 11527)\t0.20752904045981943\n (0, 11362..."
2,120,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,1.001265,1.006637,"(0, 2420)\t0.5700230506438555\n (0, 1720)\t...","(0, 12463)\t0.1584405084728794\n (0, 12366)..."
3,180,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,1.003192,1.015487,"(0, 2935)\t0.5985527917055988\n (0, 1066)\t...","(0, 12168)\t0.1135941802842356\n (0, 11261)..."
4,260,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,1.006323,1.014381,"(0, 2324)\t0.5579416322563129\n (0, 1720)\t...","(0, 11407)\t0.24273624456573292\n (0, 10340..."
...,...,...,...,...,...,...,...
1995,162650,"How do I detect ""Easter Egg"" mode in my Palm O...","<p>Since the early days, Palm OS has had a spe...",1.999541,1.001936,"(0, 1866)\t0.46452687942678234\n (0, 1646)\...","(0, 11523)\t0.1736861396348638\n (0, 11459)..."
1996,162680,The value of hobby game development,<p>Does attempting to develop some sort of gam...,1.999589,1.004701,"(0, 2848)\t0.3918436193550127\n (0, 1198)\t...","(0, 12153)\t0.1868131460456036\n (0, 11918)..."
1997,162730,Rendered pIxel width data for each character i...,<p>I have a table column that needs to be limi...,1.999670,1.002489,"(0, 2944)\t0.4610126117448486\n (0, 2180)\t...","(0, 12274)\t0.16958930694063776\n (0, 12271..."
1998,162810,How do you log the machine name via log4net?,<p>I am using Log4Net with the AdoNetAppender ...,1.999788,1.008020,"(0, 2876)\t0.43840683427196825\n (0, 1701)\...","(0, 12168)\t0.08214542688555762\n (0, 12140..."


## Save preprocessed dataset

Save dataset to pickle file, it's better to use pickle because it allows us to save ```TF-IDF vector``` scipy format of column, because pandas will force it to be string

In [20]:
pickle.dump(df, open('data/Preprocessed Questions.sav', 'wb'))