In [36]:
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
# https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f

In [37]:
import sys
import random
import os
from pathlib import Path
import shutil
import json
import pandas as pd
from collections import defaultdict
import re

import numpy as np
import argparse
import tqdm
import spacy
from spacy.gold import minibatch
from spacy.language import Language
from spacy import util

In [38]:
# Working directory of the notebook and the dataset folder beneath it.
# Both strings end with '/' because file names are appended to them directly.
current_path = f"{os.getcwd()}/"
dataset_path = f"{current_path}Collected Datasets/"

In [39]:
# Read the train / development / test CSV files from the dataset folder.
df_train, df_dev, df_test = (
    pd.read_csv(dataset_path + file_name)
    for file_name in ('train.csv', 'devel.csv', 'test.csv')
)

In [40]:
# Stack the three splits into a single frame, in train -> test -> devel order.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Row labels are kept as-is (no ignore_index), matching
# what the chained append calls produced.
df_all = pd.concat([df_train, df_test, df_dev])

In [41]:
# Token pattern: runs of 3 to 50 ASCII letters, digits or hyphens.
# Punctuation, other symbols and runs shorter than 3 characters are not matched.
pattern = re.compile(r"[A-Za-z0-9\-]{3,50}")

In [42]:
# Keep only the tokens matched by `pattern` in each message and re-join them
# with single spaces; the result is stored in a new 'clean_message' column.
matched_tokens = df_all['message'].str.findall(pattern)
df_all['clean_message'] = matched_tokens.str.join(' ')

In [43]:
# Keep only the cleaned text and its emotion label. The explicit .copy() makes
# df_all an independent frame, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
df_all = df_all[['clean_message', 'emotion']].copy()
# Show three random rows as a quick sanity check.
df_all.sample(n=3)

Unnamed: 0,clean_message,emotion
1681,DeionSandersJr DeionSanders bad Slash prices s...,fear
2406,November canola lost 464 per tonne,sadness
149,Iniesta still the one player idolize from Barc...,joy


In [44]:
# Pair every cleaned message with its emotion label as a (message, emotion) tuple.
# The pairs are kept both as a plain list (`train`) and as a 'tuples' column.
train = list(zip(df_all['clean_message'], df_all['emotion']))
df_all['tuples'] = train
train[:10]

[('How the Who the heck moved fridge should knock the landlord door angry mad',
  'anger'),
 ('Indian Uber driver just called someone the word wasn moving vehicle have jumped out disgusted',
  'anger'),
 ('DPD asked for parcel delivered pick store not address fuming poorcustomerservice',
  'anger'),
 ('whichever butt wipe pulled the fire alarm davis was sound asleep pissed angry upset tired sad tired hangry',
  'anger'),
 ('Don join BTCare they put the phone down you talk over you and are rude Taking money out acc willynilly fuming',
  'anger'),
 ('blood boiling', 'anger'),
 ('When you still got whole season Wentworth watch and stupid cunt work ruins for KirstyGA raging oldcunt',
  'anger'),
 ('why does tracking show equipment delivered when wasn Why service suddenly delayed already weeks fuming',
  'anger'),
 ('TeamShanny legit why furious with him people are such fucking idiots',
  'anger'),
 ('How suppose work you that Wtf dude Thanks for pissing off', 'anger')]

In [45]:
#!python -m spacy download en_core_web_md - (this downloads the pretrained English model loaded by spacy.load, not the spaCy framework itself)

In [46]:
# Load the pretrained 'en_core_web_md' spaCy pipeline into `nlp`.
nlp = spacy.load('en_core_web_md')

In [47]:
# Make sure the pipeline has a text-categoriser component and keep a handle to it.
if 'textcat' in nlp.pipe_names:
    # Already part of the pipeline: reuse the existing component.
    textcat = nlp.get_pipe("textcat")
else:
    # Create the built-in textcat component (mutually exclusive classes,
    # "simple_cnn" architecture) and append it at the end of the pipeline.
    textcat_config = {"exclusive_classes": True, "architecture": "simple_cnn"}
    textcat = nlp.create_pipe("textcat", config=textcat_config)
    nlp.add_pipe(textcat, last=True)
    print(nlp.pipe_names)

['tagger', 'parser', 'ner', 'textcat']


In [48]:
# Distinct emotion labels found in the dataset, as a plain Python list.
uniq_labels = df_all['emotion'].unique().tolist()

# Build the per-example category dict used as spaCy 'cats' annotations.
def get_categories(categories, uniq_labels_):
    """Return a label -> score dict for one example.

    Every label in ``uniq_labels_`` starts at 0.0; each entry of ``categories``
    is then set to 1.0. A category that is not in ``uniq_labels_`` is added as
    a new key with value 1.0.

    Args:
        categories: iterable of labels that apply to the example.
        uniq_labels_: sequence of all known labels.

    Returns:
        dict mapping each label to 0.0 or 1.0.
    """
    scores = {label: 0.0 for label in uniq_labels_}
    scores.update((active, 1.0) for active in categories)
    return scores

In [49]:
# Register every emotion label with the text categoriser.
for emotion_label in uniq_labels:
    textcat.add_label(emotion_label)

In [50]:
#this function enable to split the dataset into train and development in a random way
import random

def load_data(limit=0, split=0.8):
    """Shuffle the (text, label) pairs in ``train`` and split them in two.

    Args:
        limit: maximum number of shuffled examples to keep; 0 (the default)
            keeps everything.
        split: fraction of the kept examples that goes to the first (training)
            partition; the remainder forms the evaluation partition.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))`` where the texts
        are tuples and the cats are lists of label -> 0.0/1.0 dicts built by
        ``get_categories``.
    """
    # Work on a copy so the module-level `train` list keeps its order and
    # re-running the cell starts from the same input.
    examples = list(train)
    random.shuffle(examples)
    # Honour `limit`, which was previously accepted but silently ignored.
    if limit and limit > 0:
        examples = examples[-limit:]
    if not examples:
        # zip(*[]) cannot be unpacked into two names; return empty partitions.
        return ((), []), ((), [])
    texts, labels = zip(*examples)
    # One category dict per example, with only the gold label set to 1.0.
    cats = [get_categories([label], uniq_labels) for label in labels]

    # Index at which the training partition ends and the evaluation one begins.
    split_idx = int(len(examples) * split)
    return (texts[:split_idx], cats[:split_idx]), (texts[split_idx:], cats[split_idx:])

# Value passed as load_data()'s `limit` argument.
n_texts = 7000

# Shuffle the examples and split them into training and development partitions.
train_part, dev_part = load_data(limit=n_texts)
train_texts, train_cats = train_part
dev_texts, dev_cats = dev_part

# Final training format: (text, {'cats': {label: score}}) pairs.
train_data = [(text, {'cats': cats}) for text, cats in zip(train_texts, train_cats)]
train_data[:1]

[('There always that one song which makes you turn the radio you rather sit silence awful',
  {'cats': {'anger': 0.0,
    'normal': 0.0,
    'fear': 1.0,
    'sadness': 0.0,
    'joy': 0.0}})]

In [51]:
# Score a text categoriser on held-out examples (evaluation only, no training).
def evaluate(tokenizer, textcat, texts, cats):
    """Compute label-level precision, recall and F-score for ``textcat``.

    Each text is passed through ``tokenizer`` and then ``textcat.pipe``. For
    every (label, score) pair in the resulting ``doc.cats`` the prediction is
    compared against the gold value for that label using a 0.5 threshold on
    both sides. Labels absent from the gold dict, and the label "NEGATIVE",
    are skipped.

    Args:
        tokenizer: callable turning a text into a doc.
        textcat: object whose ``pipe(docs)`` yields docs carrying a ``cats`` dict.
        texts: iterable of input texts.
        cats: sequence of gold label -> score dicts, indexed like ``texts``.

    Returns:
        dict with keys "textcat_p", "textcat_r" and "textcat_f".
    """
    true_pos = 0.0
    # Tiny non-zero starting values keep the divisions below away from 0/0.
    false_pos = 1e-8
    false_neg = 1e-8
    true_neg = 0.0  # tallied for completeness; not part of the returned scores

    tokenized = (tokenizer(text) for text in texts)
    for idx, doc in enumerate(textcat.pipe(tokenized)):
        gold = cats[idx]
        for label, score in doc.cats.items():
            if label not in gold or label == "NEGATIVE":
                continue
            truth = gold[label]
            if score >= 0.5:
                # Predicted positive.
                if truth >= 0.5:
                    true_pos += 1.0
                elif truth < 0.5:
                    false_pos += 1.0
            elif score < 0.5:
                # Predicted negative.
                if truth < 0.5:
                    true_neg += 1
                elif truth >= 0.5:
                    false_neg += 1

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    pr_sum = precision + recall
    # Harmonic mean of precision and recall; defined as 0.0 when both are zero.
    f_score = 0.0 if pr_sum == 0 else 2 * (precision * recall) / pr_sum
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


# Number of training iterations.
n_iter=5

##### Training the model

In [94]:
from spacy.util import minibatch, compounding

# Train only the 'textcat' component for n_iter iterations, printing the
# accumulated loss and the dev-set precision / recall / F-score after each one.

# Every pipeline component except 'textcat' is disabled inside the `with` block.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()

    print("Training the model...")
    # Header row, centred in 5-character columns separated by tabs.
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

    # Performing training
    # NOTE(review): train_data is not reshuffled between iterations, so every
    # iteration sees the examples in the same order -- consider shuffling here.
    for i in range(n_iter):
        # Fresh dict each iteration, so the printed loss covers this iteration only.
        losses = {}
        # NOTE(review): compounding(4., 32., 1.001) presumably yields batch sizes
        # growing from 4 towards a cap of 32 -- confirm against the spaCy docs.
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            # Split the batch of (text, annotation) pairs into two parallel tuples.
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

        # Calling the evaluate() function and printing the scores.
        # The evaluation runs inside use_params(optimizer.averages); presumably
        # this swaps in the optimizer's averaged weights for the duration -- verify.
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

Training the model...
LOSS 	  P  	  R  	  F  
1.624	0.783	0.765	0.774
1.426	0.786	0.767	0.776
1.284	0.784	0.773	0.778
1.503	0.787	0.774	0.780
1.425	0.782	0.773	0.778


In [95]:
# Round every value of a dict to two decimal places.
def roundoff(dict_y):
    """Round each value in ``dict_y`` to 2 decimals, in place.

    Args:
        dict_y: dict whose values are numbers; it is modified directly.

    Returns:
        The same dict object, now holding the rounded values.
    """
    for key in dict_y:
        dict_y[key] = round(dict_y[key], 2)
    return dict_y

In [100]:
# Sanity check: run a sample sentence through the pipeline and show its
# category scores rounded to two decimal places.
test_text= "i love you"
doc=nlp(test_text)
roundoff(doc.cats)

{'anger': 0.0, 'normal': 0.0, 'fear': 0.09, 'sadness': 0.0, 'joy': 0.91}

In [29]:
from pathlib import Path

# Save the trained pipeline to disk under the model name `new_model_name`.
# NOTE(review): output_dir is the notebook's working directory itself, so the
# model files are written next to the notebook; consider a dedicated sub-folder.
output_dir = current_path
new_model_name = 'emotion_detection'

if output_dir is not None:
    output_dir = Path(output_dir)
    # parents=True / exist_ok=True: also create missing parent directories and
    # do not fail if the directory already exists, so no separate (racy)
    # exists() check is needed.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.meta['name'] = new_model_name  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to /Users/vineeth/Dropbox/Code Base


In [1]:
#!pip install -U Flask

Requirement already up-to-date: Flask in /Users/vineeth/opt/miniconda3/lib/python3.8/site-packages (1.1.2)


In [2]:
from flask import Flask

# Minimal Flask application with a single route.
# The class is `Flask` (as imported above); the previous `Flasks` spelling
# raised NameError when the cell ran.
app = Flask(__name__)

@app.route("/")
def hello():
    """Respond to requests for the root URL with a fixed greeting."""
    return "Hello, World!"

In [26]:
# Display the value of current_path.
current_path

'/Users/vineeth/Dropbox/Code Base/'