# Preprocessing of data + Language Model creation
### imports

In [None]:
import numpy as np
import re
import pandas as pd
import pickle
from nltk.probability import FreqDist
from nltk import bigrams
import nltk
import queue
from threading import Thread
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

### Load dataset and filter, then save pickle
Each model or important dataset will be saved as a pickle file, so that the retrieval part can load it (which is quicker than re-creating it). (310 MB)

In [None]:
# Load the raw lyrics dataset from 'lyrics.csv' (resolved against the working directory) into a DataFrame.
lyrics = pd.read_csv('lyrics.csv')

In [None]:
# Year and genre are not needed.
lyrics = lyrics.drop(columns=['year', 'genre'])
# Song titles and artist names use dashes as word separators.
lyrics = lyrics.assign(
    song=lyrics['song'].str.replace('-', ' '),
    artist=lyrics['artist'].str.replace('-', ' '),
)
# Keep only rows whose title and lyrics are not the empty string.
lyrics = lyrics[(lyrics['song'] != '') & (lyrics['lyrics'] != '')]
# Remove the 'index' column that came with the file.
lyrics = lyrics.drop(columns=['index'])
# Drop rows with any missing value; reset_index() renumbers the rows and
# keeps the previous row labels in a fresh 'index' column.
lyrics = lyrics.dropna(how='any').reset_index()
# Snapshot taken before punctuation removal and lower-casing.
lyrics1 = lyrics.copy()
# Replace every non-word character with a space, then lower-case the text.
lyrics['lyrics'] = lyrics['lyrics'].str.replace(r'\W', ' ', regex=True).str.lower()

### Tokenize and stem lyrics (then save pickle)
This results in a list containing a list for every song with the stemmed words in that song (800MB)

In [None]:
# Apply nltk's word_tokenize to the cleaned lyrics of every song.
tokenized_lyrics = lyrics['lyrics'].apply(word_tokenize)

In [None]:
# Reduce every token to its Porter stem, keeping one list of stems per song.
stemmer = PorterStemmer()
tokenized_lyrics = [list(map(stemmer.stem, song_tokens)) for song_tokens in tokenized_lyrics]

### Create unigram Language Model for each song from the tokenized lyrics and save
This results in a list of dictionaries. Each dictionary belongs to one song and contains the count of each term in that song. To get the sampled probability of a term, call `.freq(term)`. (400 MB) From this we will later create an inverted index (this may be a detour, but we only thought of the inverted index after building this).

In [None]:
# One frequency distribution (term -> count) per song, in song order.
FreqDistList = list(map(FreqDist, tokenized_lyrics))

### Merge the list of dictionaries into one to create the unigram Language Model for the collection and save
This is quite an expensive operation, but still feasible this way because the number of unique terms is not excessively large. It results in one dictionary holding every term of all lyrics together with its count in the collection. (7 MB)

In [None]:
# Sum the per-song term counts into one collection-wide Counter.
# (The loop previously iterated over the undefined name `FreqDictList`,
# which raised a NameError and left `collection` empty.)
collection = Counter()
for song_counts in FreqDistList:
    # update() adds the counts in place. The per-song counts come from
    # token occurrences and are therefore positive, so the result equals
    # repeated `+=`, without `+=`'s full rescan of the accumulated
    # Counter after every song.
    collection.update(song_counts)

In [None]:
# Convert the Counter into a plain dict with the same keys and counts.
# Unlike a Counter, the dict raises KeyError for unknown terms.
collection = dict(collection)

### Create bigrams for each song
This results in the same structure as tokenized_lyrics (3 steps above) but containing bigrams

In [None]:
# For every song, the list of adjacent (stem, next_stem) pairs.
lyrics_bigrams = [list(nltk.bigrams(song_tokens)) for song_tokens in tokenized_lyrics]

### Create bigram Language Model for each song from the lyrics bigrams and save
Creates the list of dictionaries (one for each song) containing the bigrams and their counts (1.12 GB) From this we will later create an inverted index (may be a detour but we only thought of inverted index after this)

In [None]:
# One frequency distribution (bigram -> count) per song, in song order.
bigram_freqlist = list(map(FreqDist, lyrics_bigrams))

### Merge the list of bigram dictionaries into one to create the bigram Language Model for the collection and save
This is now a very expensive operation because there are a great many unique bigrams. Merging dictionaries means checking, for each key, whether it already exists in the other dictionary before either adding to its value or creating a new key, so the cost grows with both the size of the dictionaries and the number of dictionaries to merge. To counter this, I split the list of bigram language models recursively and started a few threads to work on the parts. This is likely not the most efficient way, but it worked. (150 MB)

In [None]:
# Queue that collects the intermediate result of each worker thread.
# The class is imported by name because this assignment rebinds `queue`
# from the module to a Queue instance: `queue.Queue()` would raise an
# AttributeError if this cell were executed a second time.
from queue import Queue
queue = Queue()

In [None]:
def merge_dicts(list_of_dicts):
    """Add up all count dictionaries in the list and put the total on the queue.

    The sum is not returned; it is placed on the module-level ``queue`` so
    the results of several worker threads can be collected in one place.

    Args:
        list_of_dicts: Iterable of count mappings (e.g. Counter objects).
    """
    total = Counter()
    for counts in list_of_dicts:
        total += counts
    queue.put(total)

In [None]:
def split_list(l):
    """Split a sequence into two halves.

    For an odd length the second half receives the extra element.

    Args:
        l: Any sliceable sequence.

    Returns:
        A ``(first_half, second_half)`` tuple of slices of ``l``.
    """
    middle = len(l) // 2
    first_half = l[:middle]
    second_half = l[middle:]
    return first_half, second_half

In [None]:
# Worker threads created by recursive_threads(); they are not started here.
threads = []


def recursive_threads(dict_list, factor=4):
    """Halve ``dict_list`` recursively and queue one merge thread per part.

    Every recursion level halves the list again. At the deepest level
    (``factor`` no longer positive) a thread running ``merge_dicts`` is
    created for each of the two halves and appended to the module-level
    ``threads`` list. This yields ``2 ** (factor + 1)`` threads, i.e. 32
    for the default ``factor`` of 4.

    Args:
        dict_list: List of count dictionaries to distribute over threads.
        factor: Number of further halving levels before threads are created.
    """
    halves = split_list(dict_list)
    if factor > 0:
        for half in halves:
            recursive_threads(half, factor - 1)
        return
    threads.extend(Thread(target=merge_dicts, args=(half,)) for half in halves)

In [None]:
# Create the merge jobs: fills `threads` with (not yet started) worker threads,
# each responsible for one slice of the per-song bigram distributions.
recursive_threads(bigram_freqlist)

In [None]:
# Launch every worker first so that they can run side by side ...
for worker in threads:
    worker.start()
# ... and only then block until each of them has finished.
for worker in threads:
    worker.join()

In [None]:
# The queue now holds one partial result per thread; these few dictionaries
# are cheap enough to merge sequentially. The items are read from a snapshot
# of the queue's contents and are not removed from the queue.
bigram_collection = Counter()
for partial_counts in list(queue.queue):
    bigram_collection += partial_counts

In [None]:
# Convert the Counter into a plain dict with the same keys and counts.
# Unlike a Counter, the dict raises KeyError for unknown bigrams.
bigram_collection = dict(bigram_collection)

## Inverted language model!
For terms and bigrams separately, for efficiency: no locations needed

Data structure will be the following:<br>
{term : [ total_count, {index:count, ...} ],<br>
  term : ...}
  
'term' for unigram model and 'bigram' for bigram model

In [None]:
# Inverted index for unigrams: term -> {song_index: count_in_that_song}.
inverted_terms = {}

for song_index, term_counts in enumerate(FreqDistList):
    # Iterate the distribution directly; converting it to a dict for every
    # single term copied the whole song vocabulary once per term.
    for term, count in term_counts.items():
        # setdefault creates the posting dict the first time a term is seen.
        inverted_terms.setdefault(term, {})[song_index] = count

In [None]:
# Inverted index for bigrams: bigram -> {song_index: count_in_that_song}.
inverted_bigrams = {}

for song_index, bigram_counts in enumerate(bigram_freqlist):
    # Iterate the distribution directly; converting it to a dict for every
    # single bigram copied the whole song's bigram table once per bigram.
    for bigram, count in bigram_counts.items():
        # setdefault creates the posting dict the first time a bigram is seen.
        inverted_bigrams.setdefault(bigram, {})[song_index] = count

In [None]:
# Total number of term / bigram occurrences per song (sum of that song's counts).
song_total_terms = [sum(term_counts.values()) for term_counts in FreqDistList]
song_total_bigrams = [sum(bigram_counts.values()) for bigram_counts in bigram_freqlist]

In [None]:
# Turn every entry into [collection_count, {song_index: count, ...}].
# Only existing keys are reassigned, so the dicts keep their size while
# being iterated.
for bigram, postings in inverted_bigrams.items():
    inverted_bigrams[bigram] = [bigram_collection[bigram], postings]
for term, postings in inverted_terms.items():
    inverted_terms[term] = [collection[term], postings]