## 1. Prepare data and preprocess

In [15]:
import nltk

with open('data/Course-Descriptions.txt', 'r') as f:
    raw = f.read()
    
token_list = nltk.word_tokenize(raw)

# Define a function to: 1. remove punctuation 2. replace special character 3. convert to lower case
def process_token(tokens):
    tokens = list(token.replace("'", "") for token in tokens)
    tokens = list(token.lower() for token in tokens)
    for token in tokens:
        if not token.isalpha():
            tokens.remove(token)
    
    return tokens

token_list_final = process_token(token_list)

print("Number of tokens:", len(token_list_final))
print("\nSome sample tokens:", token_list_final[:5])

Number of tokens: 564

Some sample tokens: ['in', 'this', 'practical', 'course', 'learn']


## 2. Build n-grams database

In [17]:
from nltk.util import ngrams

#Use a sqlite database to store ngrams information
import sqlite3
conn = sqlite3.connect(":memory:")

#table to store first word, second word and count of occurance
conn.execute('''DROP TABLE IF EXISTS NGRAMS''')
conn.execute('''CREATE TABLE NGRAMS 
         (FIRST   TEXT  NOT NULL,
          SECOND  TEXT  NOT NULL,
          COUNTS  INT   NOT NULL,
         CONSTRAINT PK_GRAMS PRIMARY KEY (FIRST,SECOND));''')

#Generate bigrams
bigrams = ngrams(token_list_final,2)

#Store bigrams in DB
for i in bigrams:
    insert_str="INSERT INTO NGRAMS (FIRST,SECOND,COUNTS) \
          VALUES ('" + i[0] + "','" + i[1] + "',1 ) \
          ON CONFLICT(FIRST,SECOND) DO UPDATE SET COUNTS=COUNTS + 1"   
    conn.execute(insert_str);

#Look at sample data from the table
cursor = conn.execute("SELECT FIRST, SECOND, COUNTS from NGRAMS LIMIT 5")
for gram_row in cursor:
    print("FIRST=", gram_row[0], "SECOND=",gram_row[1],"COUNT=",gram_row[2])

FIRST= in SECOND= this COUNT= 2
FIRST= this SECOND= practical COUNT= 1
FIRST= practical SECOND= course COUNT= 1
FIRST= course SECOND= learn COUNT= 1
FIRST= learn SECOND= how COUNT= 2


## 3. Recommend the next word

In [18]:
#Function to query DB and find next word
def recommend(str):
    nextwords = []
    #Find next words, sort them by most occurance
    cur_filter = conn.execute("SELECT SECOND from NGRAMS \
                              WHERE FIRST='" + str + "' \
                              ORDER BY COUNTS DESC")
    
    #Build a list ordered from most frequent to least frequent next word
    for filt_row in cur_filter:
        nextwords.append(filt_row[0])
    return nextwords

#Recommend for words data and science
print("Next word for data ", recommend("data"))
print("\nNext word for science ", recommend("science"))

Next word for data  ['science', 'analysis', 'data', 'from', 'in', 'mining', 'munging', 'node.js', 'preparation', 'scientists', 'visualization', 'you']

Next word for science  ['begins', 'requires', 'specialists', 'teams']
