In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#plt.style.use('seaborn')
import nltk # natural language tool-kit
import re

In [2]:
data= pd.read_fwf('F://transcript.txt',header=None,names=['id','text','unknown'])   #fwf--> fixed with formatted lines

In [3]:
data.head()

Unnamed: 0,id,text,unknown
0,33_1_0001,okay let's see i want to go to a thai restaura...,
1,33_1_0002,<i> <like> <to> <eat> [uh] i like to eat at lu...,
2,33_1_0003,i don't want to walk for more than five minutes,
3,33_1_0004,tell me more about the [uh] na- nakapan [uh] r...,
4,33_1_0005,i like to go to a hamburger restaurant,


In [4]:
data.drop('unknown',inplace=True,axis=1)

In [5]:
data.head(3)

Unnamed: 0,id,text
0,33_1_0001,okay let's see i want to go to a thai restaura...
1,33_1_0002,<i> <like> <to> <eat> [uh] i like to eat at lu...
2,33_1_0003,i don't want to walk for more than five minutes


### text cleaning:
- lower case
- remove numbers
- remove spl characters
- take root word (love,loving,loved in this 3 words root word is Love(ignoring present past future singular prural etc...))
    - from this we need to do stemming and lemmatization

In [6]:
## Example

In [7]:
text= 'Innomatics Research Labs, Located in Nizampet X road, PIN: 500085'

In [8]:
#step-----1
pre_text= text.lower()
print(pre_text)
## step-----2---> special characters and NUmbers

#re.findall(r'\W',pre_text)
pre_text =  re.sub(r'[^a-z]',' ',pre_text)

pre_text

innomatics research labs, located in nizampet x road, pin: 500085


'innomatics research labs  located in nizampet x road  pin        '

In [9]:
ps= nltk.stem.PorterStemmer()   # stemmer
word= 'loving'
ps.stem(pre_text)

'innomatics research labs  located in nizampet x road  pin        '

In [10]:
pre_text.strip().split(' ')

['innomatics',
 'research',
 'labs',
 '',
 'located',
 'in',
 'nizampet',
 'x',
 'road',
 '',
 'pin']

In [11]:
#or 
from nltk import word_tokenize


In [12]:
words= word_tokenize(pre_text)
print(words)

['innomatics', 'research', 'labs', 'located', 'in', 'nizampet', 'x', 'road', 'pin']


In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Home
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
' '.join([ps.stem(word) for word in words]) ## 

'innomat research lab locat in nizampet x road pin'

In [15]:
data

Unnamed: 0,id,text
0,33_1_0001,okay let's see i want to go to a thai restaura...
1,33_1_0002,<i> <like> <to> <eat> [uh] i like to eat at lu...
2,33_1_0003,i don't want to walk for more than five minutes
3,33_1_0004,tell me more about the [uh] na- nakapan [uh] r...
4,33_1_0005,i like to go to a hamburger restaurant
5,33_1_0006,let's start again
6,33_1_0007,i like to get a hamburger at an american resta...
7,33_1_0008,i'd like to eat dinner . and i don't mind walk...
8,33_1_0009,i don't want to spend more than [uh] ten dolla...
9,33_1_0010,<(te)-ll> <me> <more> <about> <the> <two> <bar...


In [16]:
# text process or text Normalization----> nothing but text cleaning

In [17]:
def textnormalization(text):
    #step-----1
    pre_text= text.lower()
    ## step-----2---> special characters and NUmbers
    pre_text =  re.sub(r'[^a-z ]','',pre_text)
    # step-3---> Tokenization
    words = word_tokenize(pre_text) # to get words
    # step-4---> stemming
    sent = ' '.join([ps.stem(word) for word in words])
    return sent

In [18]:
data['text_norm']= data['text'].apply(textnormalization)
data.head()

Unnamed: 0,id,text,text_norm
0,33_1_0001,okay let's see i want to go to a thai restaura...,okay let see i want to go to a thai restaur uh...
1,33_1_0002,<i> <like> <to> <eat> [uh] i like to eat at lu...,i like to eat uh i like to eat at lunch time s...
2,33_1_0003,i don't want to walk for more than five minutes,i dont want to walk for more than five minut
3,33_1_0004,tell me more about the [uh] na- nakapan [uh] r...,tell me more about the uh na nakapan uh restau...
4,33_1_0005,i like to go to a hamburger restaurant,i like to go to a hamburg restaur


In [19]:
data.isnull().sum()

id           0
text         0
text_norm    0
dtype: int64

In [20]:
# drop if any missing values 

### N-gram Data preperation:

In [21]:
from nltk import ngrams, bigrams, trigrams # natural language tool kit

In [22]:
#taking 1st sentence and analysis
sent1 = data['text_norm'][0]

In [23]:
tokens = word_tokenize(sent1)   # get words in a list
ndata=list(ngrams(tokens,2,   #2--> bigram, 3-> is trigram and we can choose any number 
           pad_left=True,   # this like table (ref:notes)
           pad_right=True,
           left_pad_symbol='<sos>',      #sos--> start of sentence
           right_pad_symbol='</sos>'))         

In [24]:
pd.DataFrame(ndata).head()   # convert it into dataframe

Unnamed: 0,0,1
0,<sos>,okay
1,okay,let
2,let,see
3,see,i
4,i,want


In [25]:
def bigrams (sent,ng=2):
    try:
        tokens = word_tokenize(sent)#get words in a list
        return list(ngrams(tokens,n=ng,pad_left=True,
                          pad_right=True,
                          left_pad_symbol='<sos>',right_pad_symbol='</sos>'))
    except:
        pass #skip if any error occurs

In [26]:
corpus = list(data['text_norm'])
corpus

['okay let see i want to go to a thai restaur uh with less than ten dollar per person',
 'i like to eat uh i like to eat at lunch time so that would be eleven am to one pm',
 'i dont want to walk for more than five minut',
 'tell me more about the uh na nakapan uh restaur on martin luther king',
 'i like to go to a hamburg restaur',
 'let start again',
 'i like to get a hamburg at an american restaur',
 'id like to eat dinner and i dont mind walk uh for half an hour',
 'i dont want to spend more than uh ten dollar for a hamburg',
 'tell me more about the two barbecu restaur tell me more about the two barbecu restaur you list',
 'tell me about everett and jone barbecu flint barbecu and the thai barbecu pleas',
 'where the best place to get soup in berkeley',
 'where the best place to get soup in berkeley for lunch for under ten dollar uh with a ten minut walk',
 'what type of food do you understand',
 'i want indian food and i have chuck car so i can drive in but i onli want to drive fo

In [27]:
bi = list(map(bigrams,corpus))

In [28]:
bi

[[('<sos>', 'okay'),
  ('okay', 'let'),
  ('let', 'see'),
  ('see', 'i'),
  ('i', 'want'),
  ('want', 'to'),
  ('to', 'go'),
  ('go', 'to'),
  ('to', 'a'),
  ('a', 'thai'),
  ('thai', 'restaur'),
  ('restaur', 'uh'),
  ('uh', 'with'),
  ('with', 'less'),
  ('less', 'than'),
  ('than', 'ten'),
  ('ten', 'dollar'),
  ('dollar', 'per'),
  ('per', 'person'),
  ('person', '</sos>')],
 [('<sos>', 'i'),
  ('i', 'like'),
  ('like', 'to'),
  ('to', 'eat'),
  ('eat', 'uh'),
  ('uh', 'i'),
  ('i', 'like'),
  ('like', 'to'),
  ('to', 'eat'),
  ('eat', 'at'),
  ('at', 'lunch'),
  ('lunch', 'time'),
  ('time', 'so'),
  ('so', 'that'),
  ('that', 'would'),
  ('would', 'be'),
  ('be', 'eleven'),
  ('eleven', 'am'),
  ('am', 'to'),
  ('to', 'one'),
  ('one', 'pm'),
  ('pm', '</sos>')],
 [('<sos>', 'i'),
  ('i', 'dont'),
  ('dont', 'want'),
  ('want', 'to'),
  ('to', 'walk'),
  ('walk', 'for'),
  ('for', 'more'),
  ('more', 'than'),
  ('than', 'five'),
  ('five', 'minut'),
  ('minut', '</sos>')],
 [('<s

In [29]:
len(bi)

8566

In [30]:
print(bi[1]+bi[1])

[('<sos>', 'i'), ('i', 'like'), ('like', 'to'), ('to', 'eat'), ('eat', 'uh'), ('uh', 'i'), ('i', 'like'), ('like', 'to'), ('to', 'eat'), ('eat', 'at'), ('at', 'lunch'), ('lunch', 'time'), ('time', 'so'), ('so', 'that'), ('that', 'would'), ('would', 'be'), ('be', 'eleven'), ('eleven', 'am'), ('am', 'to'), ('to', 'one'), ('one', 'pm'), ('pm', '</sos>'), ('<sos>', 'i'), ('i', 'like'), ('like', 'to'), ('to', 'eat'), ('eat', 'uh'), ('uh', 'i'), ('i', 'like'), ('like', 'to'), ('to', 'eat'), ('eat', 'at'), ('at', 'lunch'), ('lunch', 'time'), ('time', 'so'), ('so', 'that'), ('that', 'would'), ('would', 'be'), ('be', 'eleven'), ('eleven', 'am'), ('am', 'to'), ('to', 'one'), ('one', 'pm'), ('pm', '</sos>')]


In [31]:
from functools import reduce

In [32]:
def combine(a,b):
    return a+b

In [33]:
data1= reduce(combine,bi)

In [34]:
df= pd.DataFrame(data1,columns=['w1','w2'])
df.head()

Unnamed: 0,w1,w2
0,<sos>,okay
1,okay,let
2,let,see
3,see,i
4,i,want


### PREDICTIONS:

In [35]:
#given 1st word is <sos> what is next word
df[df['w1']=='<sos>'].head()

Unnamed: 0,w1,w2
0,<sos>,okay
20,<sos>,i
42,<sos>,i
53,<sos>,tell
68,<sos>,i


In [36]:
#preparing data by adding count column
df['count']=df['w2']
df.head(3)

Unnamed: 0,w1,w2,count
0,<sos>,okay,okay
1,okay,let,let
2,let,see,see


In [37]:
model_next = df.groupby(by=['w1','w2']).count()
model_next.reset_index(inplace=True)
model_next

Unnamed: 0,w1,w2,count
0,<sos>,a,29
1,<sos>,about,49
2,<sos>,actual,14
3,<sos>,addison,1
4,<sos>,african,2
5,<sos>,after,2
6,<sos>,again,2
7,<sos>,ah,2
8,<sos>,alacart,1
9,<sos>,all,6


In [41]:
import warnings
warnings.filterwarnings('ignore')

In [42]:
given = '<sos>'  # given word
model = model_next[model_next['w1']== given]  # filtered list
den= model['count'].sum()  # counting given word in the corpus
model['probability']= model['count']/den    # calculating the prob of each word
# arrange in decesing order
m1= model.sort_values(by='probability',ascending=False).head()
m1.set_index('w1',inplace=True)
print(m1[['w2','probability']])

          w2  probability
w1                       
<sos>      i     0.226710
<sos>     id     0.061873
<sos>  start     0.036423
<sos>   tell     0.035022
<sos>   what     0.028251


In [43]:
# model_next: consists of all information
def nextword(given):
    #given = '<sos>'  # given word
    model = model_next[model_next['w1']== given]  # filtered list
    den= model['count'].sum()  # counting given word in the corpus
    model['probability']= model['count']/den    # calculating the prob of each word
# arrange in decesing order
    m1= model.sort_values(by='probability',ascending=False).head()
    m1.set_index('w1',inplace=True)
    return m1[['w2','probability']]

### UI

In [44]:
while True:
    given = input('USER: ')
    if given == 'quit':
        break
    else:
        table = nextword(given)
        print(table)

USER: 
Empty DataFrame
Columns: [w2, probability]
Index: []
USER: 
Empty DataFrame
Columns: [w2, probability]
Index: []
USER: 
Empty DataFrame
Columns: [w2, probability]
Index: []
USER: hi
     w2  probability
w1                  
hi    i     0.400000
hi   im     0.342857
hi   id     0.114286
hi  can     0.057143
hi   uh     0.057143
USER: hello
               w2  probability
w1                            
hello      </sos>     0.321429
hello           i     0.285714
hello        berp     0.071429
hello          im     0.071429
hello  loudbreath     0.071429
USER: what
         w2  probability
w1                      
what  about     0.246201
what     is     0.130699
what     do     0.097264
what   kind     0.066869
what    are     0.045593
USER: how
        w2  probability
w1                     
how  about     0.598174
how   much     0.127854
how    far     0.095890
how   mani     0.041096
how    can     0.022831
USER: if
      w2  probability
w1                   
if     i     0.375