### **POS Tagger Using Viterbi Algorithm**

In [None]:
#Importing libraries
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

# Downloading the treebank sample together with the universal tagset mapping
nltk.download('treebank')
nltk.download('universal_tagset')
from tqdm import tqdm
corpus = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# Flatten the tagged sentences into one long list of (word, tag) entries
tagged_words = []
for sentence in corpus:
    tagged_words.extend(sentence)

# Every token occurrence, in corpus order
tokens = [entry[0] for entry in tagged_words]
# Vocabulary: the distinct tokens
V = set(tokens)
# Tag set: the distinct tags
T = set(entry[1] for entry in tagged_words)

# Sizes of the tag set and of the vocabulary
t, v = len(T), len(V)
# Zero-initialised T x V array reserved for P(w|t) emission probabilities
# (it is only allocated here; nothing in this cell fills it in)
w_given_t = np.zeros((t, v))
def word_given_tag(word, tag, train_bag = tagged_words):
    """Count how often `word` carries `tag` and how often `tag` occurs at all.

    Args:
        word: token to look for.
        tag: tag to look for.
        train_bag: iterable of entries whose element 0 is the word and
            element 1 is the tag; defaults to the module-level tagged_words.

    Returns:
        A tuple (count of entries equal to (word, tag), count of entries
        carrying `tag`).
    """
    tag_total = 0
    pair_total = 0
    for entry in train_bag:
        if entry[1] != tag:
            continue
        tag_total += 1
        if entry[0] == word:
            pair_total += 1

    return (pair_total, tag_total)
# Computing tag given tag: P(t2 | t1), the transition probability of tag t2 following tag t1

def t2_given_t1(t2, t1, train_bag = tagged_words):
    """Count occurrences of tag `t1` and of `t1` immediately followed by `t2`.

    Args:
        t2: the following tag.
        t1: the preceding tag.
        train_bag: sequence of entries whose element 1 is the tag; defaults
            to the module-level tagged_words.

    Returns:
        A tuple (number of adjacent positions where t1 is followed by t2,
        number of positions tagged t1).
    """
    tag_sequence = [entry[1] for entry in train_bag]
    t1_total = sum(1 for current in tag_sequence if current == t1)
    # Pair every tag with its successor and count the (t1, t2) bigrams
    bigram_total = sum(
        1
        for previous, following in zip(tag_sequence, tag_sequence[1:])
        if previous == t1 and following == t2
    )
    return (bigram_total, t1_total)
# Creating the T x T transition matrix of tags: cell [i, j] holds the smoothed
# estimate of P(tag_j | tag_i).

# Freeze one ordering of the tag set so rows, columns and labels all agree
tag_order = list(T)
tags_matrix = np.zeros((len(tag_order), len(tag_order)), dtype='float32')
for i, t1 in enumerate(tag_order):
    for j, t2 in enumerate(tag_order):
        # t2_given_t1 scans the whole corpus, so call it once per cell and
        # reuse both counts instead of calling it separately for each one.
        pair_count, prior_count = t2_given_t1(t2, t1)
        # Additive smoothing keeps every transition probability non-zero
        tags_matrix[i, j] = (0.000001 + pair_count) / (prior_count + t * 0.000001)
# Convert the matrix to a dataframe labelled by tag on both axes
tags_df = pd.DataFrame(tags_matrix, columns=tag_order, index=tag_order)
from collections import Counter

train_bag = tagged_words
T = list(set([pair[1] for pair in train_bag]))
W = list(set([pair[0] for pair in train_bag]))
w_size = len(W)


def emp(word, tag):
    """Return the smoothed emission probability P(word | tag).

    Computed as (count(word, tag) + 1e-6) / (count(tag) + w_size * 1e-6),
    with the counts supplied by word_given_tag; the smoothing keeps the
    result non-zero for unseen (word, tag) combinations.
    """
    pair_count, tag_count = word_given_tag(word, tag)
    return (pair_count + 0.000001) / (tag_count + w_size * 0.000001)


# Gather all counts in a single pass over the corpus. Calling emp() for every
# (word, tag) cell would rescan the entire corpus for each cell.
pair_counts = Counter((pair[0], pair[1]) for pair in train_bag)
tag_counts = Counter(pair[1] for pair in train_bag)

# Rows follow W, columns follow T; every cell uses the same formula as emp()
emmission_matrix = np.zeros((len(W), len(T)))
for row, word in enumerate(W):
    for col, tag in enumerate(T):
        # Counter returns 0 for pairs that never occurred
        emmission_matrix[row, col] = (
            (pair_counts[(word, tag)] + 0.000001)
            / (tag_counts[tag] + w_size * 0.000001)
        )
emmission_dict = pd.DataFrame(emmission_matrix, columns=T, index=W)


In [None]:
def Viterbi(words):
    """Assign a POS tag to every word using transition and emission tables.

    Note that, despite the name, this is a greedy decoder rather than the
    full Viterbi dynamic programme: at each position only the single best
    tag is kept, and the next transition is conditioned on that choice.

    Args:
        words: sequence of word tokens to tag.

    Returns:
        A list of (word, tag) tuples, one per input word, in input order.
    """
    # The DataFrame index holds the same words as W but supports hash-based
    # membership tests, whereas `word in W` scans the whole list.
    known_words = emmission_dict.index
    viterbi = []          # for each word it stores the best tag

    for key, word in enumerate(words):
        # '.' is considered the start symbol; afterwards the transition is
        # taken from the previously chosen best tag.
        previous_tag = '.' if key == 0 else viterbi[-1]
        # Vocabulary membership does not depend on the tag, so test it once
        # per word instead of once per (word, tag) combination.
        is_known = word in known_words

        p = []
        for tag in T:
            transition_p = tags_df.loc[previous_tag, tag]

            if is_known:
                emission_p = emmission_dict.loc[word, tag]
            else:
                # Words absent from the training vocabulary get a small
                # constant emission probability, identical for every tag.
                emission_p = 0.000001
            viterbi_probability = emission_p * transition_p
            p.append(viterbi_probability)

        # Keep the tag with the highest score (the first one on ties)
        pmax = max(p)
        best_tag = T[p.index(pmax)]
        viterbi.append(best_tag)
    return list(zip(words, viterbi))

### ***Enhanced Sentiment Analyzer***

In [None]:
import nltk
import random
from nltk.corpus import movie_reviews
from gensim.models import Word2Vec
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
# Fetch the movie_reviews corpus
nltk.download('movie_reviews')


# Build one (token list, category label) pair per review file
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]
# Shuffle in place before splitting (this shuffle is not seeded)
random.shuffle(documents)

# Hold out 20% for testing, then 10% of the remainder for validation
train_set, test_set = train_test_split(
    documents, test_size=0.2, random_state=42
)
train_set, val_set = train_test_split(
    train_set, test_size=0.1, random_state=42
)


# Train 100-dimensional word2vec embeddings on the training documents only
model = Word2Vec(
    [tokens_of_doc for tokens_of_doc, _ in train_set],
    vector_size=100,
)


# embedding on adverbs and adjectives for enhanced sentiment analysis
def filter(doc):
    """Return the words of `doc` that Viterbi tags as 'ADJ' or 'ADV'.

    The words are returned in their original order. The function name
    shadows the builtin `filter`; it is kept because other cells call it.
    """
    return [word for word, tag in Viterbi(doc) if tag in ('ADJ', 'ADV')]


# function to create document embeddings
def document_embedding(doc, model):
    """Average the embeddings of the words in `doc` that `model.wv` contains.

    Args:
        doc: iterable of word tokens.
        model: object exposing a `wv` mapping from word to vector.

    Returns:
        The mean of the vectors of all in-vocabulary words (repeated words
        count each time they occur), or None when no word of `doc` is in
        `model.wv`.
    """
    running_total = 0
    hits = 0
    for word in doc:
        if word in model.wv:
            running_total = running_total + model.wv[word]
            hits += 1
    if hits == 0:
        return None
    return running_total / hits


# Creating feature sets: one averaged ADJ/ADV embedding per document
# (None when a document yields no embeddable word)
X_train = [document_embedding(filter(doc), model) for doc, _ in tqdm(train_set)]
X_val = [document_embedding(filter(doc), model) for doc, _ in val_set]
X_test = [document_embedding(filter(doc), model) for doc, _ in test_set]

# Extracting labels, in the same document order as the features above
y_train = [category for _, category in train_set]
y_val = [category for _, category in val_set]
y_test = [category for _, category in test_set]


def _drop_unembedded(embeddings, labels):
    """Remove every (embedding, label) pair whose embedding is None.

    Features and labels are filtered together so each remaining embedding
    keeps its own label. Truncating the label list to the new feature count
    would shift labels onto the wrong documents whenever a dropped document
    is not at the end of the list.

    Returns:
        A tuple (kept embeddings, kept labels) of equal-length lists.
    """
    kept = [
        (embedding, label)
        for embedding, label in zip(embeddings, labels)
        if embedding is not None
    ]
    return [embedding for embedding, _ in kept], [label for _, label in kept]


# Removing None values (documents that couldn't be embedded)
X_train, y_train = _drop_unembedded(X_train, y_train)
X_val, y_val = _drop_unembedded(X_val, y_val)
X_test, y_test = _drop_unembedded(X_test, y_test)


# Training an SVM classifier
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

# Evaluating the classifier on the validation set
val_predictions = svm_classifier.predict(X_val)
accuracy_val = accuracy_score(y_val, val_predictions)
print(f'Validation Accuracy: {accuracy_val:.2f}')
print(classification_report(y_val, val_predictions))

# Evaluating the classifier on the test set
test_predictions = svm_classifier.predict(X_test)
accuracy_test = accuracy_score(y_test, test_predictions)
print(f'Test Accuracy: {accuracy_test:.2f}')
print(classification_report(y_test, test_predictions))