## 1. Importing Libraries

In [5]:
import pandas as pd 
import re
import os
import spacy
import string
import numpy as np 
import random
import nltk
from nltk.corpus import stopwords
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings("ignore")
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter
from spacy.util import compounding
from spacy.util import minibatch

## 2. Loading the Data

In [46]:
train = pd.read_csv('tweet-sentiment-extraction/train.csv')
test = pd.read_csv('tweet-sentiment-extraction/test.csv')

### 2.1 Train Data Understanding

In [47]:
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [48]:
train.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,9ee413df0b,Following _aisa These guys could be shifty but...,good,neutral
freq,1,1,199,11118


### 2.2 Test Data Understanding

In [49]:
test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [50]:
test.describe()

Unnamed: 0,textID,text,sentiment
count,3534,3534,3534
unique,3534,3534,3
top,40e3087f9c,I`m at work and I`m still sick and I`m really ...,neutral
freq,1,1,1430


In [51]:
train_data = train
test_data = test
df_submission = pd.read_csv('tweet-sentiment-extraction/sample_submission.csv')

In [52]:
train_data['Tweet_amount'] = train_data['text'].apply(lambda x:len(str(x).split())) #Number Of words in main Text in train set

In [53]:
train_data = train_data[train_data['Tweet_amount']>=3]

In [54]:
def save_model(path, nlp, new_model_name):
    #saving model to a given path  
    path = f'../working/{path}'
    if path is not None:        
        if not os.path.exists(path):
            os.makedirs(path)
        nlp.meta["name"] = new_model_name
        nlp.to_disk(path)
        print("Saved model to", path)

In [55]:
# pass model = nlp if you want to train on top of existing model 

def train(train_data, path, iterations=20, model=None):
    #Load the model, set up the pipeline and train the entity recognizer
    if model is not None:
        nlp = spacy.load(path)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
    
    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()


        for itn in tqdm(range(iterations)):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))    
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,  # batch of texts
                            annotations,  # batch of annotations
                            drop=0.5,   # dropout - make it harder to memorise data
                            losses=losses, 
                            )
            print("Losses", losses)
    save_model(path, nlp, 'st_ner')

In [56]:
def get_model_out_path(sentiment):
    #Returns Model output path
    model_out_path = None
    if sentiment == 'positive':
        model_out_path = 'models/model_pos'
    elif sentiment == 'negative':
        model_out_path = 'models/model_neg'
    return model_out_path

In [57]:
def get_training_data(sentiment):
    
    #Return Training data in the required format
    
    return_data = []
    for index, row in train_data.iterrows():
        if row.sentiment == sentiment:
            selected_text = row.selected_text
            text = row.text
            start = text.find(selected_text)
            end = start + len(selected_text)
            return_data.append((text, {"entities": [[start, end, 'selected_text']]}))
    return return_data

In [58]:
sentiment = 'positive'

training = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)
# For DEmo Purposes I have taken 3 iterations you can train the model as you want
train(training, model_path, iterations=3, model=None)

Created blank 'en' model


 33%|████████████████████████████                                                        | 1/3 [01:37<03:14, 97.34s/it]

Losses {'ner': 34043.1057221723}


 67%|████████████████████████████████████████████████████████                            | 2/3 [03:16<01:37, 97.74s/it]

Losses {'ner': 31175.016752837713}


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [04:52<00:00, 97.46s/it]

Losses {'ner': 29369.49760783429}
Saved model to ../working/models/model_pos





In [59]:
sentiment = 'negative'

training = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)

train(training, model_path, iterations=3, model=None)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Created blank 'en' model


 33%|████████████████████████████                                                        | 1/3 [01:10<02:20, 70.39s/it]

Losses {'ner': 32232.013164540782}


 67%|████████████████████████████████████████████████████████                            | 2/3 [02:31<01:13, 73.58s/it]

Losses {'ner': 29049.085896846507}


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:44<00:00, 74.89s/it]

Losses {'ner': 26970.56469711427}
Saved model to ../working/models/model_neg





In [60]:
def predict_entities(text, model):
    doc = model(text)
    ent_array = []
    for ent in doc.ents:
        start = text.find(ent.text)
        end = start + len(ent.text)
        new_int = [start, end, ent.label_]
        if new_int not in ent_array:
            ent_array.append([start, end, ent.label_])
    selected_text = text[ent_array[0][0]: ent_array[0][1]] if len(ent_array) > 0 else text
    return selected_text

In [61]:
def jaccard(a, b): 
    x = set(a.lower().split()) 
    y = set(b.lower().split())
    return float(len(x.intersection(y))) / (len(x) + len(y) - len(x.intersection(y)))

In [68]:
selected_texts = []
MODELS_BASE_PATH = '../working/models/'

if MODELS_BASE_PATH is not None:
    print("Loading Models  from ", MODELS_BASE_PATH)
    model_pos = spacy.load(MODELS_BASE_PATH + 'model_pos')
    model_neg = spacy.load(MODELS_BASE_PATH + 'model_neg')
        
    for index, row in train_data.iterrows():
        text = row.text
        output_str = ""
        if row.sentiment == 'neutral' or len(text.split()) <= 2:
            selected_texts.append(text)
        elif row.sentiment == 'positive':
            selected_texts.append(predict_entities(text, model_pos))
        else:
            selected_texts.append(predict_entities(text, model_neg))
        
train_data['selected_text_pred'] = selected_texts

Loading Models  from  ../working/models/


In [69]:
train_data.reset_index(drop=True, inplace=True)

In [89]:
temp = []
for i in range(len(train_data)):
    temp.append(jaccard(train_data['selected_text'][i], train_data['selected_text_pred'][i]))
train_data['jaccard'] = temp

In [90]:
train_data['jaccard'].mean()

0.6574537432439022

In [91]:
selected_texts = []
MODELS_BASE_PATH = '../working/models/'

if MODELS_BASE_PATH is not None:
    print("Loading Models  from ", MODELS_BASE_PATH)
    model_pos = spacy.load(MODELS_BASE_PATH + 'model_pos')
    model_neg = spacy.load(MODELS_BASE_PATH + 'model_neg')
        
    for index, row in test_data.iterrows():
        text = row.text
        output_str = ""
        if row.sentiment == 'neutral' or len(text.split()) <= 2:
            selected_texts.append(text)
        elif row.sentiment == 'positive':
            selected_texts.append(predict_entities(text, model_pos))
        else:
            selected_texts.append(predict_entities(text, model_neg))
        
test_data['selected_text'] = selected_texts

Loading Models  from  ../working/models/


In [92]:
df_submission['selected_text'] = test_data['selected_text']
df_submission.to_csv("submission.csv", index=False)
display(df_submission.head(10))

Unnamed: 0,textID,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,exciting
2,eee518ae67,shame!
3,01082688c6,happy bday!
4,33987a8ee5,I like it!!
5,726e501993,that`s great!! weee!! visitors!
6,261932614e,HATES
7,afa11da83f,blocked
8,e64208b4ef,and within a short time of the last clue all ...
9,37bcad24ca,What did you get? My day is alright.. haven`...


In [4]:
!pip install spaCy --user

Collecting spaCy
  Using cached spacy-2.2.4-cp36-cp36m-win_amd64.whl (9.9 MB)
Collecting wasabi<1.1.0,>=0.4.0
  Using cached wasabi-0.6.0-py3-none-any.whl (20 kB)
Collecting blis<0.5.0,>=0.4.0
  Using cached blis-0.4.1-cp36-cp36m-win_amd64.whl (5.0 MB)
Collecting thinc==7.4.0
  Using cached thinc-7.4.0-cp36-cp36m-win_amd64.whl (2.1 MB)
Collecting plac<1.2.0,>=0.9.6
  Using cached plac-1.1.3-py2.py3-none-any.whl (20 kB)
Collecting catalogue<1.1.0,>=0.0.7
  Using cached catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting srsly<1.1.0,>=1.0.2
  Using cached srsly-1.0.2-cp36-cp36m-win_amd64.whl (179 kB)
Collecting importlib-metadata>=0.20; python_version < "3.8"
  Using cached importlib_metadata-1.6.1-py2.py3-none-any.whl (31 kB)
Collecting zipp>=0.5
  Using cached zipp-3.1.0-py3-none-any.whl (4.9 kB)
Installing collected packages: wasabi, blis, plac, zipp, importlib-metadata, catalogue, srsly, thinc, spaCy
Successfully installed blis-0.4.1 catalogue-1.0.0 importlib-metadata-1.6.1 plac

You should consider upgrading via the 'c:\users\vedan\anaconda3\python.exe -m pip install --upgrade pip' command.


NameError: name 'user' is not defined

In [72]:
train_data

Unnamed: 0,textID,text,selected_text,sentiment,Tweet_amount,selected_text_pred
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,7,"I`d have responded, if I were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,10,Sooo SAD I will miss you here in San Diego!!!
2,088c60f138,my boss is bullying me...,bullying me,negative,5,my boss is bullying me...
3,9642c003ef,what interview! leave me alone,leave me alone,negative,5,what interview! leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,14,"Sons of ****, why couldn`t they put them on t..."
...,...,...,...,...,...,...
26747,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,16,wish we could come see u on Denver husband l...
26748,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,23,I`ve wondered about rake to. The client has ...
26749,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,22,Yay good for both of you. Enjoy the break - yo...
26750,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,6,worth


Unnamed: 0,textID,text,selected_text,sentiment,Tweet_amount,selected_text_pred
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,7,"I`d have responded, if I were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,10,Sooo SAD I will miss you here in San Diego!!!
2,088c60f138,my boss is bullying me...,bullying me,negative,5,my boss is bullying me...
3,9642c003ef,what interview! leave me alone,leave me alone,negative,5,what interview! leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,14,"Sons of ****, why couldn`t they put them on t..."
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,12,http://www.dothebouncy.com/smf - some shameles...
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,14,fun
7,e050245fbd,Both of you,Both of you,neutral,3,Both of you
8,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,10,Wow...
9,2339a9b08b,"as much as i love to be hopeful, i reckon the...","as much as i love to be hopeful, i reckon the ...",neutral,23,"as much as i love to be hopeful, i reckon the..."


In [84]:
temp = []
for i in range(5):
    temp[i] = jaccard(train_data['selected_text'][i],train_data['selected_text_pred'][i])

IndexError: list assignment index out of range

In [88]:
temp

[1.0,
 0.2,
 0.16666666666666666,
 0.6,
 0.21428571428571427,
 1.0,
 1.0,
 1.0,
 0.2,
 1.0,
 0.0,
 0.125,
 1.0,
 1.0,
 0.2,
 0.1111111111111111,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.09090909090909091,
 0.1111111111111111,
 1.0,
 0.375,
 0.07692307692307693,
 0.09090909090909091,
 0.3333333333333333,
 0.2,
 1.0,
 0.5714285714285714,
 1.0,
 1.0,
 0.2,
 0.25,
 1.0,
 0.0,
 1.0,
 0.16666666666666666,
 1.0,
 0.0625,
 1.0,
 0.08695652173913043,
 0.3888888888888889,
 1.0,
 1.0,
 1.0,
 1.0,
 0.25,
 1.0,
 1.0,
 0.8,
 0.6666666666666666,
 1.0,
 0.6153846153846154,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.1111111111111111,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.92,
 1.0,
 0.3076923076923077,
 1.0,
 0.0,
 1.0,
 0.8,
 1.0,
 1.0,
 0.1111111111111111,
 0.043478260869565216,
 0.14285714285714285,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.07692307692307693,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.8,
 0.16666666666666666,
 1.0,
 0.3333333333333333,
