In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC
from textblob import TextBlob

In [None]:
def gaussianNaiveBayes(X, Y):
    """Fit a Gaussian Naive Bayes model on a 75/25 split and score the held-out part.

    Args:
        X: feature matrix accepted by scikit-learn.
        Y: labels aligned with the rows of X.

    Returns:
        list: [confusion matrix, accuracy, macro F1, macro precision,
        macro recall, classification report text], in that order.
    """
    # Seeded split so repeated runs evaluate on the same rows
    trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(
        X, Y, test_size=0.25, random_state=0
    )

    model = GaussianNB()
    model.fit(trainFeatures, trainLabels)
    predicted = model.predict(testFeatures)

    # Macro-averaged scores, computed in the order they are returned
    macroScores = [
        scorer(testLabels, predicted, average="macro")
        for scorer in (f1_score, precision_score, recall_score)
    ]

    return [
        confusion_matrix(y_true=testLabels, y_pred=predicted),
        accuracy_score(predicted, testLabels),
        macroScores[0],
        macroScores[1],
        macroScores[2],
        classification_report(testLabels, predicted),
    ]



def MultiLayerPerceptron(X, Y):
    """Train a two-hidden-layer MLP on a 75/25 split and score the held-out part.

    Args:
        X: feature matrix accepted by scikit-learn.
        Y: labels aligned with the rows of X.

    Returns:
        list: [confusion matrix, accuracy, macro F1, macro precision,
        macro recall, classification report text], in that order.
    """
    # Split in train and test
    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.25, random_state=0)

    # Fit the classifier.
    # random_state pins the weight initialisation and batch shuffling; without it
    # the reported metrics change on every run even though the split is seeded.
    classifier = MLPClassifier(
        alpha=10.0 ** -1,
        hidden_layer_sizes=(100, 150),
        max_iter=100,
        random_state=0,
    )
    classifier.fit(xTrain, yTrain)

    # Make predictions
    yPred = classifier.predict(xTest)

    # Confusion Matrix and accuracy
    matrix = confusion_matrix(y_true=yTest, y_pred=yPred)
    accuracy = accuracy_score(yTest, yPred)

    # Precision, Recall and F-Score (macro-averaged over the classes)
    fScore = f1_score(yTest, yPred, average="macro")
    precision = precision_score(yTest, yPred, average="macro")
    recall = recall_score(yTest, yPred, average="macro")
    report = classification_report(yTest, yPred)
    return [matrix, accuracy, fScore, precision, recall, report]


def SVM(X, Y):
    """Fit a linear-kernel SVC on a 75/25 split and score the held-out part.

    Args:
        X: feature matrix accepted by scikit-learn.
        Y: labels aligned with the rows of X.

    Returns:
        list: [confusion matrix, accuracy, macro F1, macro precision,
        macro recall, classification report text], in that order.
    """
    heldOutFraction = 0.25
    fitX, heldOutX, fitY, heldOutY = train_test_split(
        X, Y, test_size=heldOutFraction, random_state=0
    )

    # The hyper-parameters below were recorded from an earlier GridSearchCV run
    # (linear kernel, C in (1, 10), gamma in ('auto', 'scale'),
    # decision_function_shape in ('ovo', 'ovr'), 10-fold CV scored on accuracy):
    # {'C': 1, 'decision_function_shape': 'ovo', 'gamma': 'auto', 'kernel': 'linear'}
    svc = SVC(C=1, kernel='linear', decision_function_shape='ovo', gamma='auto')
    svc.fit(fitX, fitY)

    guesses = svc.predict(heldOutX)

    results = []
    results.append(confusion_matrix(y_true=heldOutY, y_pred=guesses))
    results.append(accuracy_score(guesses, heldOutY))
    # Macro-averaged F1, precision and recall
    results.append(f1_score(heldOutY, guesses, average="macro"))
    results.append(precision_score(heldOutY, guesses, average="macro"))
    results.append(recall_score(heldOutY, guesses, average="macro"))
    results.append(classification_report(heldOutY, guesses))
    return results

def trainBestClassifier(X, Y):
    """Fit the chosen SVC configuration on all of the supplied data.

    Unlike the evaluation helpers, no rows are held out here.

    Args:
        X: feature matrix accepted by scikit-learn.
        Y: labels aligned with the rows of X.

    Returns:
        The fitted SVC instance.
    """
    # Recorded best parameters:
    # {'C': 1, 'decision_function_shape': 'ovo', 'gamma': 'auto', 'kernel': 'linear'}
    bestParams = dict(C=1, kernel='linear', decision_function_shape='ovo', gamma='auto')
    model = SVC(**bestParams)
    model.fit(X, Y)
    return model

In [None]:
def _contextWindow(row, aspectIndex, aspectTermsLen, windowSize):
    """Return up to `windowSize` tagged words surrounding the aspect term.

    Half of the budget (rounded down) goes to the left of the aspect and the
    rest to the right. Whatever one side cannot use, because the sentence ends
    first, is offered to the other side. The aspect's own tokens are excluded.
    """
    leftBudget = windowSize // 2
    rightBudget = windowSize - leftBudget
    rightStart = aspectIndex + aspectTermsLen

    availableLeft = aspectIndex
    availableRight = max(len(row) - rightStart, 0)

    takeLeft = min(leftBudget, availableLeft)
    takeRight = min(rightBudget, availableRight)

    # Hand the unused part of each side's budget to the opposite side
    spareLeft = leftBudget - takeLeft
    spareRight = rightBudget - takeRight
    takeLeft = min(availableLeft, takeLeft + spareRight)
    takeRight = min(availableRight, takeRight + spareLeft)

    return row[aspectIndex - takeLeft:aspectIndex] + row[rightStart:rightStart + takeRight]


def aspectAnalysis(df, output=False, windowSize=10):
    """Build the feature matrix from the words surrounding each aspect term.

    For every row, the aspect term's first token is located in `tagged_words`
    and a window of neighbouring words is collected around it. The window text
    plus the aspect term is stored in a new `important_words` column (added to
    `df` in place), vectorised with TF-IDF (unigrams and bigrams), and extended
    with one extra column holding TextBlob's polarity of the full `text`.

    Args:
        df: frame with `tagged_words` (sequences of (word, tag) pairs),
            `aspect_term` and `text` columns.
        output: when True no labels are read and Y is returned as None.
        windowSize: total number of context words to keep around the aspect.

    Returns:
        tuple: (df, X, Y) where X is a dense 2-D array and Y is the label array
        or None.
    """
    corpus = []

    for row, aspect in zip(df['tagged_words'], df['aspect_term']):
        aspectSplit = word_tokenize(aspect)

        # Find the aspect term's first word's index in row.
        # It stays None when the aspect tokenises to nothing or its first word
        # is absent from the row; previously that raised IndexError / NameError
        # or silently reused the index found for the previous row.
        aspectIndex = None
        if aspectSplit:
            for i, tagged in enumerate(row):
                if tagged[0] == aspectSplit[0]:
                    aspectIndex = i
                    break

        if aspectIndex is None:
            # Nothing to centre the window on: keep the whole sentence as context
            contextWords = list(row)
        else:
            contextWords = _contextWindow(row, aspectIndex, len(aspectSplit), windowSize)

        # Context words as plain text, followed by the aspect term itself
        corpus.append(' '.join(tagged[0] for tagged in contextWords) + ' ' + aspect)

    # Create a column with the important words around the aspect term
    df['important_words'] = corpus

    # TF-IDF over the context sentences
    cv = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    overall_sentiment = [TextBlob(sentence).sentiment.polarity for sentence in df['text']]

    # Adding overall sentiment as the last feature column
    X = np.concatenate(
        (cv.fit_transform(corpus).toarray(),
         np.asarray(overall_sentiment).reshape(len(overall_sentiment), 1)),
        1)

    Y = None
    if not output:
        # NOTE(review): labels are read by position (fifth column) - confirm every
        # frame passed with output=False really has its label column there.
        Y = df.iloc[:, 4].values
    return df, X, Y

In [None]:
def returnDatasetInfo(df):
    """Print the shape, info summary, descriptive statistics and null check of df.

    Args:
        df: the pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    import io

    # DataFrame.info() writes to a stream and returns None, so formatting its
    # return value printed "None". Capture the text in a buffer instead.
    infoBuffer = io.StringIO()
    df.info(buf=infoBuffer)

    # Return the basic structure info about the dataset
    print("Shape \n{}\n\n".format(df.shape))
    print("Info \n{}\n\n".format(infoBuffer.getvalue()))
    print("Description \n{}\n\n".format(df.describe()))
    print("Missing values check \n{}\n\n".format(df.isnull().any()))

In [None]:
# Load the tweets and move the CSV's unnamed first column to the end under the
# name 'example_id' (pop removes the old column, the assignment appends the new one).
df = pd.read_csv('/content/English_final.csv')
df['example_id'] = df.pop('Unnamed: 0')
# Optional sanity check: returnDatasetInfo(df)

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline once; entity_sentence_pair() calls the
# resulting `nlp` object on every tweet to read the detected entities.
import spacy
nlp=spacy.load('en_core_web_sm')

In [None]:
def entity_sentence_pair(data):
  """Pair each tweet with every ORG / PRODUCT entity that spaCy finds in it.

  Args:
    data: frame with a 'Tweet' column (text) and an 'example_id' column.

  Returns:
    DataFrame with one row per (tweet, entity) pair and the columns
    'example_id', 'text' (the full tweet), 'aspect_term' (the entity text) and
    'term_location' ('<start_char>--<end_char>' of the entity in the tweet).
    Tweets without a matching entity contribute no rows.
  """
  wantedLabels = ('ORG', 'PRODUCT')
  example_id = []
  sent_lis = []
  ent_lis = []
  ent_loc = []

  # zip walks both columns by position, so the frame may carry any index.
  # Looking rows up with data['Tweet'][i] only worked for a default 0..n-1 index.
  for tweetId, text in zip(data['example_id'], data['Tweet']):
    # Missing tweets (NaN / None) cannot be parsed; they yield no pairs
    if not isinstance(text, str):
      continue
    doc = nlp(text)
    for ent in doc.ents:
      if ent.label_ in wantedLabels:
        example_id.append(tweetId)
        sent_lis.append(text)
        ent_lis.append(ent.text)
        ent_loc.append(str(ent.start_char) + '--' + str(ent.end_char))

  return pd.DataFrame({
    'example_id': example_id,
    'text': sent_lis,
    'aspect_term': ent_lis,
    'term_location': ent_loc,
  })

In [None]:
# Build one row per (tweet, ORG/PRODUCT entity) pair from the loaded tweets.
data=entity_sentence_pair(df)

In [None]:
def preprocessData(df):
    """Clean the 'text' and 'aspect_term' columns and POS-tag the text.

    Both columns have every non-letter character replaced by a space, are
    lower-cased, lose the literal "comma" marker, and have runs of whitespace
    collapsed. English stop words are then dropped from 'text' only, and the
    remaining words are POS-tagged into a new 'tagged_words' column.

    The frame is modified in place and also returned.

    Args:
        df: frame with string columns 'text' and 'aspect_term'.

    Returns:
        The same frame, with cleaned columns and the added 'tagged_words'.
    """
    for column in ('text', 'aspect_term'):
        # Remove unnecessary stuff: anything that is not an ASCII letter.
        # This used to be a loop that rebound its loop variable and threw the
        # result away, so nothing was stripped. Brackets and underscores are
        # covered by this step as well.
        cleaned = df[column].str.replace(r'[^a-zA-Z]', ' ', regex=True)

        # Make all the capital letters small
        cleaned = cleaned.str.lower()

        # Remove the "comma" marker left over from "[comma]" tokens
        cleaned = cleaned.str.replace("comma", "", regex=True)

        # Collapse the whitespace runs produced above
        cleaned = cleaned.str.replace(r'\W+', ' ', regex=True).str.strip()

        df[column] = cleaned

    # Remove the stop words (from the text only).
    # NOTE: aspect_term keeps its stop words, so an aspect whose first word is
    # a stop word will not be present in tagged_words.
    stopWords = set(stopwords.words("english"))
    df['text'] = df['text'].apply(
        lambda sentence: ' '.join(
            word for word in word_tokenize(sentence) if word not in stopWords
        )
    )

    # Tag the words: each word is paired with its type, e.g. adjective, noun
    df['tagged_words'] = df['text'].apply(
        lambda sentence: nltk.pos_tag(word_tokenize(sentence))
    )

    return df

In [None]:
# preprocessData() tokenises with word_tokenize, filters with the English
# stop-word list and tags with nltk.pos_tag, so all three NLTK resources must
# be present on a fresh kernel. Downloads are skipped when already up to date.
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)
data=preprocessData(data)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# `data` comes from entity_sentence_pair() and has no label column: its fifth
# column is 'tagged_words'. Pass output=True so no labels are read, keep the
# feature matrix, and show only its shape instead of dumping the whole tuple.
data, dataFeatures, _ = aspectAnalysis(data, output=True)
dataFeatures.shape

IndexError: ignored

In [None]:
# Read the train and test datasets.
# The separator is a regular expression (a comma with optional surrounding
# whitespace). Only pandas' python parser engine supports regex separators, so
# it is named explicitly rather than relying on the fallback warning.
df_comp_in = pd.read_csv('data/data-2_train.csv', sep=r'\s*,\s*', engine='python')
df_comp_out = pd.read_csv('data/Data-2_test.csv', sep=r'\s*,\s*', engine='python')

# Your output file name
outFile = "/content/output.txt"

# Placeholder labels so the test frame has the same columns as the train frame
df_comp_out['class'] = np.ones(len(df_comp_out))

# Preprocess and vectorise train and test together so they share one vocabulary
df = pd.concat([df_comp_in, df_comp_out])
df = preprocessData(df)

# NOTE(review): aspectAnalysis reads labels from the fifth column by position -
# confirm 'class' sits there in both CSVs.
df, X, Y = aspectAnalysis(df)
X_train = X[0:len(df_comp_in)]
Y_train = Y[0:len(df_comp_in)]
X_test = X[len(df_comp_in):]

# Classifier
classfier_comp = trainBestClassifier(X_train, Y_train)
Y_test = classfier_comp.predict(X_test)
# NOTE(review): printOutput is not defined in the visible part of this notebook -
# confirm it exists before running this cell.
printOutput(df_comp_out, Y_test, outFile)