<a href="https://colab.research.google.com/github/sumitdua10/natural-language-processing/blob/master/KNN_Neighbors_Text_Classifier_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
genesis_ic = wn.ic(genesis, False, 0.0)

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package genesis to /root/nltk_data...
[nltk_data]   Unzipping corpora/genesis.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [0]:
class KNN_NLC_Classifer():
    def __init__(self, k=1, distance_type = 'path'):
        self.k = k
        self.distance_type = distance_type

    # This function is used for training
    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train

    # This function runs the K(1) nearest neighbour algorithm and
    # returns the label with closest match. 
    def predict(self, x_test):
        self.x_test = x_test
        y_predict = []

        for i in range(len(x_test)):
            max_sim = 0
            max_index = 0
            for j in range(self.x_train.shape[0]):
                temp = self.document_similarity(x_test[i], self.x_train[j])
                if temp > max_sim:
                    max_sim = temp
                    max_index = j
            y_predict.append(self.y_train[max_index])
        return y_predict


    def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        try:
            return tag_dict[tag[0]]
        except KeyError:
            return None


    def doc_to_synsets(self, doc):
        """
            Returns a list of synsets in document.
            Tokenizes and tags the words in the document doc.
            Then finds the first synset for each word/tag combination.
        If a synset is not found for that combination it is skipped.

        Args:
            doc: string to be converted

        Returns:
            list of synsets
        """
        tokens = word_tokenize(doc+' ')
        
        l = []
        tags = nltk.pos_tag([tokens[0] + ' ']) if len(tokens) == 1 else nltk.pos_tag(tokens)
        
        for token, tag in zip(tokens, tags):
            syntag = self.convert_tag(tag[1])
            syns = wn.synsets(token, syntag)
            if (len(syns) > 0):
                l.append(syns[0])
        return l  
    

    def similarity_score(self, s1, s2, distance_type = 'path'):
          """
          Calculate the normalized similarity score of s1 onto s2
          For each synset in s1, finds the synset in s2 with the largest similarity value.
          Sum of all of the largest similarity values and normalize this value by dividing it by the
          number of largest similarity values found.

          Args:
              s1, s2: list of synsets from doc_to_synsets

          Returns:
              normalized similarity score of s1 onto s2
          """
          s1_largest_scores = []

          for i, s1_synset in enumerate(s1, 0):
              max_score = 0
              for s2_synset in s2:
                  if distance_type == 'path':
                      score = s1_synset.path_similarity(s2_synset, simulate_root = False)
                  else:
                      score = s1_synset.wup_similarity(s2_synset)                  
                  
                      #score = s1_synset.jcn_similarity(s2_synset, genesis_ic)
                      #score = s1_synset.jcn_similarity(s2_synset, brown_ic)
                      #score = s1_synset.lin_similarity(s2_synset, semcor_ic)
                      #score = s1_synset.res_similarity(s2_synset, genesis_ic)
                      #score = s1_synset.res_similarity(s2_synset, brown_ic)
                      #score = s1_synset.wup_similarity(s2_synset)
                      #score = s1_synset.lch_similarity(s2_synset)
                      #score = s1_synset.wup_similarity(s2_synset)

                  if score != None:
                      if score > max_score:
                          max_score = score
              
              if max_score != 0:
                  s1_largest_scores.append(max_score)
          
          mean_score = np.mean(s1_largest_scores)
                 
          return mean_score  
        
        
    def document_similarity(self,doc1, doc2):
          """Finds the symmetrical similarity between doc1 and doc2"""

          synsets1 = self.doc_to_synsets(doc1)
          synsets2 = self.doc_to_synsets(doc2)
          
          return (self.similarity_score(synsets1, synsets2) + self.similarity_score(synsets2, synsets1)) / 2

In [3]:
doc1 = 'I like rains'
doc2 = 'I like showers'
x = KNN_NLC_Classifer()
print("Test Similarity Score: ", x.document_similarity(doc1, doc2))

Test Similarity Score:  0.6946386946386947


In [0]:


synonyms = []
antonyms = []
#print(wordnet.synsets("shower"))
for syn in wordnet.synsets("shower"):
    #print(syn)
    for l in syn.lemmas():
        synonyms.append(l.name())

print(synonyms)



['shower', 'shower', 'shower_bath', 'shower', 'rain_shower', 'shower', 'cascade', 'exhibitor', 'exhibitioner', 'shower', 'shower', 'lavish', 'shower', 'shower', 'shower', 'shower', 'shower_down', 'shower']


In [4]:
# 1. Importing the dataset
#we'll use the demo dataset available at Watson NLC Classifier Demo.
FILENAME = "https://raw.githubusercontent.com/watson-developer-cloud/natural-language-classifier-nodejs/master/training/weather_data_train.csv"          

dataset = pd.read_csv(FILENAME, header = None)

dataset.rename(columns = {0:'text', 1:'answer'}, inplace = True)

dataset['output'] = np.where(dataset['answer'] == 'temperature', 1,0)
Num_Words = dataset.shape[0]

print(dataset.head())
print("\nSize of input file is ", dataset.shape)


                            text       answer  output
0           How hot is it today?  temperature       1
1             Is it hot outside?  temperature       1
2  Will it be uncomfortably hot?  temperature       1
3         Will it be sweltering?  temperature       1
4          How cold is it today?  temperature       1

Size of input file is  (50, 3)


In [17]:
import re
nltk.download('stopwords')
s = stopwords.words('english')
#add additional stop words
s.extend(['today', 'tomorrow', 'outside', 'out', 'there'])

ps = nltk.wordnet.WordNetLemmatizer()
for i in range(dataset.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', dataset.loc[i,'text'])
    review = review.lower()
    review = review.split()

    review = [ps.lemmatize(word) for word in review if not word in s]
    review = ' '.join(review)
    dataset.loc[i, 'text'] = review

X_train = dataset['text']
y_train = dataset['output']

print("Below is the sample of training text after removing the stop words")
print(dataset['text'][:10])



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'bo

In [23]:

# 4. Train the Classifier
classifier = KNN_NLC_Classifer(k=1, distance_type='path')
classifier.fit(X_train, y_train)

final_test_list = ['will it rain', 'Is it hot outside?' , 'What is the expected high for today?' , 
                   'Will it be foggy tomorrow?', 'Should I prepare for sleet?',
                     'Will there be a storm today?', 'do we need to take umbrella today',
                    'will it be wet tomorrow', 'is it humid tomorrow', 'what is the precipitation today',
                    'is it freezing outside', 'is it cool outside', "are there strong winds outside",]
                 
test_corpus = []
for i in range(len(final_test_list)):
    review = re.sub('[^a-zA-Z]', ' ', final_test_list[i])
    review = review.lower()
    review = review.split()

    review = [ps.lemmatize(word) for word in review if not word in s]
    review = ' '.join(review)
    test_corpus.append(review)

y_pred_final = classifier.predict(test_corpus)

output_df = pd.DataFrame(data = {'text': final_test_list, 'code': y_pred_final})
output_df['answer'] = np.where(output_df['code']==1, 'Temperature','Conditions')
print(output_df)


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


    code                                  text       answer
0      0                          will it rain   Conditions
1      1                    Is it hot outside?  Temperature
2      1  What is the expected high for today?  Temperature
3      1            Will it be foggy tomorrow?  Temperature
4      0           Should I prepare for sleet?   Conditions
5      0          Will there be a storm today?   Conditions
6      0     do we need to take umbrella today   Conditions
7      0               will it be wet tomorrow   Conditions
8      1                  is it humid tomorrow  Temperature
9      1       what is the precipitation today  Temperature
10     1                is it freezing outside  Temperature
11     1                    is it cool outside  Temperature
12     0        are there strong winds outside   Conditions


In [0]:



# 7. Transform the test data; Predict the Test Data and Calculate the score
X_final_test_vectorized = vect.transform(test_corpus)
#y_pred_final = classifier.predict_proba(X_final_test_vectorized)
perc_0, perc_1 = zip(*y_pred_final)
print(y_pred_final[0])

output_dict = {1:'Temprature', 0: 'Condition'}
output_df = pd.DataFrame(data = {'text': final_test_list, 'perc_0': perc_0, 'perc_1':perc_1})
print(output_df)
output_df['answer'] = np.where(output_df['perc_0'] > 0.5, 'Conditions', 'Temperature')
output_df['perc'] = np.where(output_df['answer']=='Temperature', output_df['perc_1'] * 100, output_df['perc_0'] * 100)

output_df.drop(['perc_0', 'perc_1'], inplace=True, axis=1)
print(output_df.head())
print(y_pred_final)

#score = roc_auc_score(y_test,y_pred_decision )
#print("Accuracy Score (AUC) is ", score)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Max Score for  rain  is  1.0 .
 Closest Match is  rain  index 25
Max Score for  hot  is  1.0 .
 Closest Match is  hot  index 0
Max Score for  expected high  is  1.0 .
 Closest Match is  expected high  index 8
Max Score for  foggy  is  0 .
 Closest Match is  hot  index 0
Max Score for  prepare sleet  is  0.731578947368421 .
 Closest Match is  wind dangerous  index 42
Max Score for  storm  is  0.8235294117647058 .
 Closest Match is  wind dangerous  index 42
Max Score for  umbrella  is  0.4122807017543859 .
 Closest Match is  see sun  index 36
Max Score for  rain  is  1.0 .
 Closest Match is  rain  index 25
Max Score for  need take umbrella  is  0.49090909090909085 .
 Closest Match is  expecting sunny condition  index 28
Max Score for  wet  is  0.6845238095238095 .
 Closest Match is  expected humidity  index 47
Max Score for  shower  is  0.4632352941176471 .
 Closest Match is  see sun  index 36
Max Score for  humid  is  1.0 .
 Closest Match is  humid  index 46
Max Score for  precipitation

TypeError: ignored

In [0]:




# Cleaning the texts

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
s = stopwords.words('english')
print(len(s))

#print(type(s))
s.remove('not')
print(len(s))
#review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][0])


#review = review.lower()
#review = review.split()
#print(review)

#ps = PorterStemmer()
#review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]

#review = 'a '.join(review)
#print(review)



#corpus.append(review)
#print(corpus)

corpus = []
for i in range(0, Num_Words):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    #review = review.lower()
    #review = review.split()
    #ps = PorterStemmer()
    #review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    #review = ' '.join(review)
    corpus.append(review)

print(len(corpus))
#print(corpus.size)
#print(corpus.shape)
#print(corpus[0])
#corpus.remove((corpus[0]))

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range = (1,3)).fit(corpus)
fnames = cv.get_feature_names()
print(len(fnames))
X = cv.transform(corpus).toarray()
print(X.shape)
print(type(X))
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Fitting Naive Bayes to the Training set
import sklearn.naive_bayes as nb
import sklearn.linear_model as lm

#classifier =  lm.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
#classifier = nb.BernoulliNB()
classifier = nb.MultinomialNB()
#classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
#y_dec = classifier.decision_function(X_test)
#y_prob = classifier.predict_proba(X_test)
#df = pd.DataFrame(X_test)
#df[1:2] = y_prob
#print(df.head())
#df.to_csv(TestFile)

#print(y_prob.shape)
print(y_pred.shape)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_test, y_pred)


from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))


#inputtext = input("Enter the question")
inputtext = "food was not good"
review = re.sub('[^a-zA-Z]', ' ', inputtext)
review = review.lower()
#review = review.split()
print(review)
#review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
#review = ' '.join(review)

print(review)


x = list()
x = [review]
print(x)
#list.append(review)
#corpus.append(review)
#print(len(corpus))
#print(corpus[-1])
#cv = CountVectorizer()#max_features = 1500)
#X_test_run = cv.fit_transform(corpus).toarray()

x_new = cv.transform(x).toarray()
print(x_new.shape)
#print(X_test_run.shape)
y = classifier.predict(x_new)
print("Classifier Response is ",y[-1])

