<a href="https://colab.research.google.com/github/vadlasushma/Sentiment_Analysis_legal/blob/main/SL_ABSA_classicmodels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# pre-trained vector for w2v
import gensim
import gensim.downloader as api
nlp = api.load('word2vec-google-news-300')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#clean the dataset
def clean_dataset(dataSet):
    """Return the rows of ``dataSet`` that contain no missing or infinite values.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows. The original index labels
        are kept, so the index may contain gaps.

    Raises
    ------
    AssertionError
        If ``dataSet`` is not a DataFrame.
    """
    assert isinstance(dataSet, pd.DataFrame)
    # Non-inplace dropna leaves the caller's frame untouched.
    without_missing = dataSet.dropna()
    # `axis` must be passed by keyword: pandas 2.x no longer accepts `.any(1)`.
    has_bad_value = without_missing.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    # Copy so later column assignments on the result do not act on a slice.
    return without_missing.loc[~has_bad_value].copy()
# preprocess the dataset
def pre_process(dataset):
    """Lower-case the text columns in place and collect the distinct aspects.

    The 'Sentence' and 'Aspect' columns of ``dataset`` are overwritten with
    their lower-cased values. Returns the same frame together with an array
    of the unique aspect strings after leading whitespace is stripped.
    """
    for column in ('Sentence', 'Aspect'):
        dataset[column] = dataset[column].str.lower()
    stripped_aspects = dataset['Aspect'].str.lstrip()
    return dataset, stripped_aspects.unique()

In [None]:
#W2V
#BOW based approaches
# Prepare the normalised vector matrix once, up front: word_averaging below
# reads vectors through `syn0norm`.
# NOTE(review): presumably replace=True overwrites the raw vectors with the
# normalised ones -- confirm against the installed gensim version.
nlp.init_sims(replace=True) # calling for using syn0norm

def word_averaging(wv, words):
    """Average the word vectors of ``words`` into one unit-length vector.

    Parameters
    ----------
    wv : gensim KeyedVectors
        Either a gensim >= 4 model (``key_to_index`` / ``get_vector``) or a
        legacy model exposing ``vocab`` and ``syn0norm``.
    words : iterable
        Tokens to look up. Items that are already ``numpy.ndarray`` vectors
        are used as-is; tokens missing from the model are skipped.

    Returns
    -------
    numpy.ndarray
        float32 unit vector of the mean, or an all-zero vector of length
        ``wv.vector_size`` when no token could be resolved.
    """
    # Imported locally because the notebook's import cell does not provide it.
    import logging

    # gensim 4 removed `vocab`/`syn0norm`; detect the newer API by attribute.
    uses_key_to_index = hasattr(wv, 'key_to_index')
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif uses_key_to_index:
            if word in wv.key_to_index:
                vectors.append(wv.get_vector(word, norm=True))
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the averaged vector of every token list in ``text_list`` row-wise."""
    averaged_rows = []
    for tokens in text_list:
        averaged_rows.append(word_averaging(wv, tokens))
    return np.vstack(averaged_rows)

# Tokenize, and apply word vector averaging to tokenized text
import nltk
nltk.download('punkt')
# Recent NLTK releases load the sentence tokenizer from the 'punkt_tab'
# resource instead of 'punkt'; fetch both so sent_tokenize/word_tokenize work
# on either version (an unknown id is only reported, not raised, on old NLTK).
nltk.download('punkt_tab')
def w2v_tokenize_text(text):
    """Split ``text`` into word tokens, dropping tokens shorter than two characters.

    The text is first split into sentences and each sentence into words,
    both with NLTK's English tokenizers.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]
    

# NOTE(review): train_data / test_data are only loaded and cleaned in later
# cells, so on a fresh kernel this cell must run after them -- confirm the
# intended execution order.
# Tokenise every sentence, then average its word vectors into one row.
X_train_tokenized = train_data['Sentence'].apply(w2v_tokenize_text).to_numpy()
X_test_tokenized = test_data['Sentence'].apply(w2v_tokenize_text).to_numpy()

X_train_word_average = word_averaging_list(nlp, X_train_tokenized)
X_test_word_average = word_averaging_list(nlp, X_test_tokenized)



In [None]:
#def for finding the position of the aspect word in the sentence
import regex as re
def search_pos(sentences, party):
    """Return the character offset of ``party`` inside ``sentences``.

    The possessive form (``party's``) is tried first, then the bare word;
    both are matched case-insensitively between word boundaries.

    Parameters
    ----------
    sentences : str
        Text to search.
    party : str
        Aspect / party name, treated as literal text.

    Returns
    -------
    int
        Start offset of the first match, or 0 when there is no match.
        Note that 0 is therefore also returned for a match at the very
        start of the text.
    """
    # Escape the name so characters such as '.', '+' or '(' are matched
    # literally instead of being interpreted as regex syntax.
    literal = re.escape(party)

    possessive = re.search(r'\b' + literal + r"'s\b", sentences, re.IGNORECASE)
    if possessive is not None and possessive.group():
        return possessive.start()

    plain = re.search(r'\b' + literal + r'\b', sentences, re.IGNORECASE)
    if plain is not None and plain.group():
        return plain.start()

    return 0
# def for finding if aspect is present in sentence ()
def search(sentences, party):
    """Report whether ``party`` occurs in ``sentences``.

    The possessive form (``party's``) and the bare word are both accepted,
    matched case-insensitively between word boundaries.

    Parameters
    ----------
    sentences : str
        Text to search.
    party : str
        Aspect / party name, treated as literal text.

    Returns
    -------
    int
        1 when either form is found, otherwise 0.
    """
    # Escape the name so characters such as '.', '+' or '(' are matched
    # literally instead of being interpreted as regex syntax.
    literal = re.escape(party)
    candidate_patterns = (
        r'\b' + literal + r"'s\b",
        r'\b' + literal + r'\b',
    )
    for pattern in candidate_patterns:
        found = re.search(pattern, sentences, re.IGNORECASE)
        if found is not None and found.group():
            return 1
    return 0

In [None]:
#Reading Dataset
# Alternative (masked-sentence) dataset, kept for reference:
# train_data = pd.read_csv("/content/drive/My Drive/Legal/Masked-sent-part-strat-nodupes-train.csv")
# test_data =  pd.read_csv("/content/drive/My Drive/Legal/Masked-sent-part-strat-nodupes-test.csv")
import csv
# Load the train and test splits from the mounted Google Drive folder.
train_data = pd.read_csv("/content/drive/My Drive/Datasets/SABSA-new-augmented-train-after-sen_part-stra.csv")
test_data =  pd.read_csv("/content/drive/My Drive/Datasets/SABSA-new-test-after-sen_part-Stra.csv")
# Optional read_csv argument that was tried for these files: encoding='cp1252'

In [None]:
# Cleaning: drop rows with missing / infinite values, then echo both frames
train_data = clean_dataset(train_data)
test_data = clean_dataset(test_data)
print(train_data, test_data, sep='\n')

# Preprocessing: lower-case the text columns and collect each split's aspects
train_data, train_vocab = pre_process(train_data)
test_data, test_vocab = pre_process(test_data)
print(train_data, test_data, sep='\n')

# Shared aspect vocabulary: report the combined size, then de-duplicate
vocab_aspect = np.hstack((train_vocab, test_vocab))
print(len(vocab_aspect))
vocab_aspect = np.unique(vocab_aspect)

In [None]:
# Making of aspect matrix (position of aspect and if aspect is present in sentence or not)
def _aspect_matrix(sentences, aspects, probe):
    """Return one row per sentence holding probe(sentence, aspect) for every aspect."""
    return [[probe(sentence, aspect) for aspect in aspects] for sentence in sentences]

# 1. position of aspect in sent
l1 = _aspect_matrix(train_data['Sentence'], vocab_aspect, search_pos)
l2 = _aspect_matrix(test_data['Sentence'], vocab_aspect, search_pos)

# 2. if aspect is present or not in sentence
l3 = _aspect_matrix(train_data['Sentence'], vocab_aspect, search)
l4 = _aspect_matrix(test_data['Sentence'], vocab_aspect, search)

In [None]:
#lists to dataframes (aspect matrix)
aspect_matrix_train_pos = pd.DataFrame(l1, columns=vocab_aspect)
aspect_matrix_test_pos = pd.DataFrame(l2, columns=vocab_aspect)
aspect_matrix_train = pd.DataFrame(l3, columns=vocab_aspect)
aspect_matrix_test = pd.DataFrame(l4, columns=vocab_aspect)
# lists to dataframes (w2v matrix)
vect2 = pd.DataFrame(X_train_word_average)
vect1 = pd.DataFrame(X_test_word_average)

# Every feature block must describe the same rows; a mismatch means the word
# vectors were computed from a different version of the data frames.
assert len(vect2) == len(aspect_matrix_train) == len(train_data), "train feature blocks have different row counts"
assert len(vect1) == len(aspect_matrix_test) == len(test_data), "test feature blocks have different row counts"

#concatenate w2v matrix, aspect matrix (position + aspect search), party label column from dataset
# pd.concat(axis=1) aligns on the index. The data frames keep their original
# (possibly gappy) index after cleaning while the matrices above are numbered
# 0..n-1, so the label column is re-indexed to line up row by row.
# Prefixes give every column a unique string name: the w2v columns are
# otherwise integers, and scikit-learn rejects mixed int/str feature names.
X_train = pd.concat(
    [
        vect2.add_prefix('w2v_'),
        aspect_matrix_train.add_prefix('has_'),
        aspect_matrix_train_pos.add_prefix('pos_'),
        train_data['PartyLabel'].reset_index(drop=True),
    ],
    axis=1,
)
X_test = pd.concat(
    [
        vect1.add_prefix('w2v_'),
        aspect_matrix_test.add_prefix('has_'),
        aspect_matrix_test_pos.add_prefix('pos_'),
        test_data['PartyLabel'].reset_index(drop=True),
    ],
    axis=1,
)
y_train = train_data['Sentiment'].reset_index(drop=True)
y_test = test_data['Sentiment'].reset_index(drop=True)

## Model Evaluation


In [None]:
# Logistic regression baseline
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Fit on the training features, then predict the held-out test set
logclf = LogisticRegression(solver='liblinear').fit(X_train, y_train)
y_pred = logclf.predict(X_test)
print(classification_report(y_test, y_pred))

# First model evaluated: the shared accuracy list is (re)started here
score = accuracy_score(y_test, y_pred)
all_accuracy = [score]

# Confusion matrix is the value displayed by this cell
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Per-class ROC stores shared by the model cells: each model appends its
# false-positive (fpN) and true-positive (tpN) rates for class N.
fp0, fp1, fp2 = [], [], []
tp0, tp1, tp2 = [], [], []

In [None]:
#Roc curve (one-vs-rest) for the logistic regression model
pred = logclf.predict(X_test)
pred_prob = logclf.predict_proba(X_test)

# roc curve for classes, keyed by predict_proba column index
fpr = {}
tpr = {}
thresh ={}

n_class = 3

for i in range(n_class):
    # Column i of predict_proba belongs to logclf.classes_[i]; using that label
    # as pos_label keeps the curve correct even when the sentiment labels are
    # not exactly 0, 1, 2.
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:,i], pos_label=logclf.classes_[i])

# Keep this model's curves for the cross-model comparison plots
fp0.append(fpr[0])
fp1.append(fpr[1])
tp0.append(tpr[0])
tp1.append(tpr[1])
fp2.append(fpr[2])
tp2.append(tpr[2])
# plotting
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# Model-specific file name so the other ROC cells do not overwrite this figure
plt.savefig('Multiclass ROC Logisticclf',dpi=300);

In [None]:
# Decision tree (entropy criterion, depth limited to 3)
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Fit on the training features, then predict the held-out test set
dtclf = DecisionTreeClassifier(criterion="entropy", max_depth=3).fit(X_train, y_train)
y_pred = dtclf.predict(X_test)

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))

# Record this model's accuracy alongside the earlier models
score = accuracy_score(y_test, y_pred)
all_accuracy += [score]

# Confusion matrix is the value displayed by this cell
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
#roc curve (one-vs-rest) for the decision tree
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
pred = dtclf.predict(X_test)
pred_prob = dtclf.predict_proba(X_test)

# roc curve for classes, keyed by predict_proba column index
fpr = {}
tpr = {}
thresh ={}

n_class = 3

for i in range(n_class):
    # Column i of predict_proba belongs to dtclf.classes_[i]; using that label
    # as pos_label keeps the curve correct even when the sentiment labels are
    # not exactly 0, 1, 2.
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:,i], pos_label=dtclf.classes_[i])
print(fpr[2])
# Keep this model's curves for the cross-model comparison plots
fp0.append(fpr[0])
fp1.append(fpr[1])
tp0.append(tpr[0])
tp1.append(tpr[1])
fp2.append(fpr[2])
tp2.append(tpr[2])
# plotting
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# Model-specific file name so the other ROC cells do not overwrite this figure
plt.savefig('Multiclass ROC DecisionTree',dpi=300);

In [None]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# Fixed seed: the forest is built from random bootstrap samples and feature
# subsets, so without it the reported scores change on every run.
rfclf=RandomForestClassifier(n_estimators=100, random_state=0)

rfclf.fit(X_train,y_train)

y_pred = rfclf.predict(X_test)

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))
from sklearn.metrics import accuracy_score
# Record this model's accuracy alongside the earlier models
score =accuracy_score(y_test,y_pred)
all_accuracy.append(score)
# import the metrics class
from sklearn import metrics
# Confusion matrix is the value displayed by this cell
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
#roc curve (one-vs-rest) for the random forest
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
pred = rfclf.predict(X_test)
pred_prob = rfclf.predict_proba(X_test)

# roc curve for classes, keyed by predict_proba column index
fpr = {}
tpr = {}
thresh ={}

n_class = 3

for i in range(n_class):
    # Column i of predict_proba belongs to rfclf.classes_[i]; using that label
    # as pos_label keeps the curve correct even when the sentiment labels are
    # not exactly 0, 1, 2.
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:,i], pos_label=rfclf.classes_[i])

# Keep this model's curves for the cross-model comparison plots
fp0.append(fpr[0])
fp1.append(fpr[1])
tp0.append(tpr[0])
tp1.append(tpr[1])
fp2.append(fpr[2])
tp2.append(tpr[2])
# plotting
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# Model-specific file name so the other ROC cells do not overwrite this figure
plt.savefig('Multiclass ROC RandomForest',dpi=300);

In [None]:
# Support vector machine with a linear kernel
from sklearn import metrics, svm
from sklearn.metrics import accuracy_score

# probability=True so the ROC cell can call predict_proba on this model
svmclf = svm.SVC(kernel='linear', probability=True).fit(X_train, y_train)

# Predict the held-out test set and report per-class metrics
y_pred = svmclf.predict(X_test)
print(classification_report(y_test, y_pred))

# Record this model's accuracy alongside the earlier models
score = accuracy_score(y_test, y_pred)
all_accuracy += [score]

# Confusion matrix is the value displayed by this cell
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
#roc curve (one-vs-rest) for the SVM
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
pred = svmclf.predict(X_test)
pred_prob = svmclf.predict_proba(X_test)

# roc curve for classes, keyed by predict_proba column index
fpr = {}
tpr = {}
thresh ={}

n_class = 3

for i in range(n_class):
    # Column i of predict_proba belongs to svmclf.classes_[i]; using that label
    # as pos_label keeps the curve correct even when the sentiment labels are
    # not exactly 0, 1, 2.
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:,i], pos_label=svmclf.classes_[i])

# Keep this model's curves for the cross-model comparison plots
fp0.append(fpr[0])
fp1.append(fpr[1])
tp0.append(tpr[0])
tp1.append(tpr[1])
fp2.append(fpr[2])
tp2.append(tpr[2])
# plotting
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# Model-specific file name so the other ROC cells do not overwrite this figure
plt.savefig('Multiclass ROC SVM',dpi=300);

In [None]:
# k-nearest-neighbours classifier (k = 7)
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Fit on the training features, then predict the held-out test set
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

# Record this model's accuracy alongside the earlier models
score = accuracy_score(y_test, y_pred)
all_accuracy += [score]

# Confusion matrix is the value displayed by this cell
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
#roc curve (one-vs-rest) for the k-nearest-neighbours model
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
pred = knn.predict(X_test)
pred_prob = knn.predict_proba(X_test)

# roc curve for classes, keyed by predict_proba column index
fpr = {}
tpr = {}
thresh ={}

n_class = 3

for i in range(n_class):
    # Column i of predict_proba belongs to knn.classes_[i]; using that label
    # as pos_label keeps the curve correct even when the sentiment labels are
    # not exactly 0, 1, 2.
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:,i], pos_label=knn.classes_[i])

# Keep this model's curves for the cross-model comparison plots
fp0.append(fpr[0])
fp1.append(fpr[1])
tp0.append(tpr[0])
tp1.append(tpr[1])
fp2.append(fpr[2])
tp2.append(tpr[2])
# plotting
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# Model-specific file name so the other ROC cells do not overwrite this figure
plt.savefig('Multiclass ROC KNN',dpi=300);

In [None]:
# Gaussian naive Bayes classifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Fit on the training features, then predict the held-out test set
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print(classification_report(y_test, y_pred))

# Record this model's accuracy alongside the earlier models
score = accuracy_score(y_test, y_pred)
all_accuracy += [score]

# Confusion matrix is the value displayed by this cell
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
#roc curve (one-vs-rest) for Gaussian naive Bayes
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
pred = gnb.predict(X_test)
pred_prob = gnb.predict_proba(X_test)

# roc curve for classes, keyed by predict_proba column index
fpr = {}
tpr = {}
thresh ={}

n_class = 3

for i in range(n_class):
    # Column i of predict_proba belongs to gnb.classes_[i]; using that label
    # as pos_label keeps the curve correct even when the sentiment labels are
    # not exactly 0, 1, 2.
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, pred_prob[:,i], pos_label=gnb.classes_[i])

# Keep this model's curves for the cross-model comparison plots
fp0.append(fpr[0])
fp1.append(fpr[1])
tp0.append(tpr[0])
tp1.append(tpr[1])
fp2.append(fpr[2])
tp2.append(tpr[2])
# plotting
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# Model-specific file name so the other ROC cells do not overwrite this figure
plt.savefig('Multiclass ROC Naive_bayes',dpi=300);

In [None]:
#Perceptron
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler
# Perceptron with learning rate eta0=0.1 and a fixed seed.
# n_iter_no_change=40 is the number of epochs without improvement to wait
# before stopping; it is not a cap on the number of epochs.
# NOTE(review): the earlier comment described "40 iterations (epochs)"; if a
# hard limit of 40 epochs was intended, max_iter=40 is the matching parameter.
ppn = Perceptron(n_iter_no_change=40,eta0=0.1,random_state=0)

# Train the perceptron
ppn.fit(X_train, y_train)
# Apply the trained perceptron on the X data to make predicts for the y test data
y_pred = ppn.predict(X_test)
print(classification_report(y_test, y_pred))
from sklearn.metrics import accuracy_score
# Record this model's accuracy alongside the earlier models
score =accuracy_score(y_test,y_pred)
all_accuracy.append(score)
# import the metrics class
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
# Printed explicitly: a bare expression is only displayed when it is the last
# statement of a cell, and this one is followed by another print.
print(cnf_matrix)
print(all_accuracy)

### Visualizations

In [None]:
# Display names, in the order the models were evaluated above
classifiers = ['Logisticclf','DecisionTree','RandomForest','SVM','KNN','Naive_bayes','Perceptron']

# Bar chart: test accuracy of each model
import matplotlib.pyplot as plt
fig = plt.figure(figsize =(10, 5))
ax = fig.add_axes([0,0,1,1])
ax.bar(classifiers,all_accuracy)
# Labels so the figure can be read on its own
ax.set_xlabel('Classifier')
ax.set_ylabel('Test accuracy')
ax.set_title('Accuracy of all models', fontweight='bold', fontsize=15)
plt.show()

# Box plot: spread of the accuracies across all models
fig = plt.figure(figsize =(7, 5))
 # Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])
 # Creating plot
bp = ax.boxplot(all_accuracy)
ax.set_ylabel('Test accuracy')
plt.title('Box Plot of Accuracy of all models', fontweight='bold', fontsize=15)
# show plot
plt.show()

In [None]:
#Roc curve of all models for class 0
fig = plt.figure(figsize=(8,6))

# The last classifier (Perceptron) recorded no ROC curve, so it is left out.
for model_name, false_pos, true_pos in zip(classifiers[:-1], fp0, tp0):
    plt.plot(false_pos, true_pos, label="{}".format(model_name))

# Diagonal = chance level
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis for class 0', fontweight='bold', fontsize=15)
plt.legend(prop={'size':10}, loc='lower right')

plt.show()

In [None]:
#Roc Curve for all models for class1
fig = plt.figure(figsize=(8,6))

# The last classifier (Perceptron) recorded no ROC curve, so it is left out.
for model_name, false_pos, true_pos in zip(classifiers[:-1], fp1, tp1):
    plt.plot(false_pos, true_pos, label="{}".format(model_name))

# Diagonal = chance level
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis for class 1', fontweight='bold', fontsize=15)
plt.legend(prop={'size':10}, loc='lower right')

plt.show()

In [None]:
#Roc curve for all models for class2
fig = plt.figure(figsize=(8,6))

# The last classifier (Perceptron) recorded no ROC curve, so it is left out.
for model_name, false_pos, true_pos in zip(classifiers[:-1], fp2, tp2):
    plt.plot(false_pos, true_pos, label="{}".format(model_name))

# Diagonal = chance level
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# This figure plots the class-2 curves (fp2 / tp2), so the title says class 2.
plt.title('ROC Curve Analysis for class 2', fontweight='bold', fontsize=15)
plt.legend(prop={'size':10}, loc='lower right')

plt.show()

### Reference of TFIDF Vectorizer code

In [None]:
# # TfidfVectorizer 
# # CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
# import pandas as pd
# def tfidf_columns(dataf) :
 
#   # countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english',vocabulary=vocab)
#   tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')
#   # count_wm = countvectorizer.fit_transform(dataf['Sentence'])
#   tfidf_wm = tfidfvectorizer.fit_transform(dataf['Sentence'])
#   # print(count_wm)
#   #retrieve the terms found in the corpora
#   # if we take same parameters on both Classes(CountVectorizer and TfidfVectorizer) , it will give same output of get_feature_names() methods)
#   #count_tokens = tfidfvectorizer.get_feature_names() # no difference
#   # count_tokens = countvectorizer.get_feature_names()
#   # print(count_tokens)
#   tfidf_tokens = tfidfvectorizer.get_feature_names()
#   # df_countvect = pd.DataFrame(data = count_wm.toarray(),columns = count_tokens)
#   df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),columns = tfidf_tokens)
#   # print("Count Aspects\n")
#   # print(df_countvect)
#   print("\nTD-IDF Sentences\n")
#   print(df_tfidfvect)
#   return df_tfidfvect

# t1=tfidf_columns(train_data)
# t2=tfidf_columns(test_data)


# # df_countvect['lee'].head(10)
# print(type(t1))
# vocab_train=t1.columns
# new_vocab_train=[x for x in vocab_train if not any(c.isdigit() for c in x)]
# type(vocab_train)
# new_vocab_train.pop(0)
# vocab_test=t2.columns
# new_vocab_test=[x for x in vocab_test if not any(c.isdigit() for c in x)]
# new_vocab_test.pop(0)
# def tfidf_with_vocab(dataf,vocab) :
 
#   # countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english',vocabulary=vocab)
#   tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english',vocabulary=vocab)
#   # count_wm = countvectorizer.fit_transform(dataf['Sentence'])
#   tfidf_wm = tfidfvectorizer.fit_transform(dataf['Sentence'])
#   # print(count_wm)
#   #retrieve the terms found in the corpora
#   # if we take same parameters on both Classes(CountVectorizer and TfidfVectorizer) , it will give same output of get_feature_names() methods)
#   #count_tokens = tfidfvectorizer.get_feature_names() # no difference
#   # count_tokens = countvectorizer.get_feature_names()
#   # print(count_tokens)
#   tfidf_tokens = tfidfvectorizer.get_feature_names()
#   # df_countvect = pd.DataFrame(data = count_wm.toarray(),columns = count_tokens)
#   df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),columns = tfidf_tokens)
#   # print("Count Aspects\n")
#   # print(df_countvect)
#   print("\nTD-IDF Sentences\n")
#   print(df_tfidfvect)
#   return df_tfidfvect
# # print(train_vocab)
# vocab_for_tfidf = np.hstack((new_vocab_train,new_vocab_test))
# len(vocab_for_tfidf)
# type(vocab_for_tfidf)
# vocab_for_tfidf=np.unique(vocab_for_tfidf)
# len(vocab_for_tfidf)
# t1=tfidf_with_vocab(train_data,vocab_for_tfidf)
# t2=tfidf_with_vocab(test_data,vocab_for_tfidf)

# # t1=tfidf_with_vocab(train_data,new_vocab_train)
# # t2=tfidf_with_vocab(test_data,new_vocab_test)
# Sentence = 'hi i am gentleman'
# party = 'gentleman'
# search(Sentence,party)