# Twitter sentiment extraction

This is my first attemp to train NER model using spaCy package.

References: 
- https://www.kaggle.com/uyangas/twitter-sentiment-extaction-analysis-eda-and-model
- https://spacy.io/usage/examples

In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tweet-sentiment-extraction/sample_submission.csv
/kaggle/input/tweet-sentiment-extraction/test.csv
/kaggle/input/tweet-sentiment-extraction/train.csv


In [2]:
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")
test = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")
sample = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/sample_submission.csv")

In [4]:
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
train['length'] = [len(str(i).split()) for i in train.text]
train = train[train['length']>=3]
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,length
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,7
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,10
2,088c60f138,my boss is bullying me...,bullying me,negative,5
3,9642c003ef,what interview! leave me alone,leave me alone,negative,5
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,14


In [6]:
import spacy
import random
from tqdm import tqdm
from spacy.lang import en
from spacy.util import minibatch, compounding

Function that saves the model

In [7]:
def save_model(output_dir, nlp, new_model_name):
    ''' This Function Saves model to 
    given output directory'''
    
    output_dir = f"../working/{output_dir}"
    if output_dir is not None:        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        nlp.meta["name"] = new_model_name
        nlp.to_disk(output_dir)
        
        print("Saved model to", output_dir)

In [8]:
def get_model_dir(sentiment):
    '''
    Returns Model output path
    '''
    model_out_path = None
    if sentiment == 'positive':
        model_out_path = 'models/model_positive'
    elif sentiment == 'negative':
        model_out_path = 'models/model_negative'
    return model_out_path

Function that creates the dataset

In [9]:
def extract_data(sentiment, df_train):
    '''Creates training data in format that spacy model can understand.
    The data is a tuple of a text and entity: (text, {'entities':(start, end, selected_text)})
    
    where:
    text: actual text
    entities: start and end of the word that is being identified
    selected_text: entity name
    '''
    
    train_data = []
    for index, line in df_train.iterrows():
        if line['sentiment'] == sentiment:
            text = line.text
            selected = line.selected_text
            start = text.find(selected)
            end = start + len(selected)
            train_data.append((text, {'entities':[(start, end, selected)]}))
            
    print("<< TRAIN DATA CREATED >>")
    return train_data

Train the model

In [10]:
def model_train(train_data, output_dir, n_iter=20, model=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    ""
    if model is not None:
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
    
    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()


        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))    
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,  # batch of texts
                            annotations,  # batch of annotations
                            drop=0.5,   # dropout - make it harder to memorise data
                            losses=losses, 
                            )
            print("Losses", losses)
    save_model(output_dir, nlp, 'st_ner')

In [11]:
sentiment = 'positive'
train_data = extract_data(sentiment, train)
output_dir = get_model_dir(sentiment)

model_train(train_data, output_dir, 7)

<< TRAIN DATA CREATED >>
Created blank 'en' model


 14%|█▍        | 1/7 [11:40<1:10:00, 700.10s/it]

Losses {'ner': 45635.24517799465}


 29%|██▊       | 2/7 [23:22<58:24, 700.86s/it]  

Losses {'ner': 37569.90769448121}


 43%|████▎     | 3/7 [35:03<46:43, 700.87s/it]

Losses {'ner': 35151.30553745678}


 57%|█████▋    | 4/7 [46:42<35:00, 700.29s/it]

Losses {'ner': 33336.152208729196}


 71%|███████▏  | 5/7 [58:24<23:21, 700.78s/it]

Losses {'ner': 32886.87284834079}


 86%|████████▌ | 6/7 [1:10:09<11:42, 702.00s/it]

Losses {'ner': 31990.116289977224}


100%|██████████| 7/7 [1:21:50<00:00, 701.51s/it]

Losses {'ner': 31116.36245565485}





Saved model to ../working/models/model_positive


In [12]:
sentiment = 'negative'
train_data = extract_data(sentiment, train)
output_dir = get_model_dir(sentiment)

model_train(train_data, output_dir, 7)

<< TRAIN DATA CREATED >>
Created blank 'en' model


 14%|█▍        | 1/7 [11:30<1:09:04, 690.80s/it]

Losses {'ner': 44989.06116682953}


 29%|██▊       | 2/7 [23:01<57:34, 690.84s/it]  

Losses {'ner': 37996.908570554384}


 43%|████▎     | 3/7 [34:37<46:08, 692.22s/it]

Losses {'ner': 35863.64316424511}


 57%|█████▋    | 4/7 [46:29<34:54, 698.33s/it]

Losses {'ner': 34559.194435192956}


 71%|███████▏  | 5/7 [58:00<23:12, 696.12s/it]

Losses {'ner': 33784.63994754338}


 86%|████████▌ | 6/7 [1:09:33<11:35, 695.23s/it]

Losses {'ner': 33055.96251623852}


100%|██████████| 7/7 [1:21:08<00:00, 695.51s/it]

Losses {'ner': 32320.160524706156}





Saved model to ../working/models/model_negative


Prediction function

In [13]:
def predict_entities(text, model):
    doc = model(text)
    ent_array = []
    
    for ent in doc.ents:
        start = text.find(ent.text)
        end = start + len(ent.text)
        new_int = [start, end, ent.label_]
        if new_int not in ent_array:
            ent_array.append([start, end, ent.label_])
    selected_text = text[ent_array[0][0]: ent_array[0][1]] if len(ent_array) > 0 else text
    
    return selected_text

In [14]:
selected_texts = []
path = "../working/models/"

if path is not None:
    model_positive = spacy.load(path + "model_positive")
    model_negative = spacy.load(path + "model_negative")
    print("Models loaded")
    
    print("Prediction started")
    for index, line in test.iterrows():
        text = line.text

        if line.sentiment == 'neutral': #or len(text.split()) <= 2:
            selected_texts.append(text)
        elif line.sentiment == 'positive':
            selected_texts.append(predict_entities(text, model_positive))
        else:
            selected_texts.append(predict_entities(text, model_negative))
    print("Prediction finished")
    
test['selected_text'] = selected_texts

Models loaded
Prediction started
Prediction finished


In [15]:
sample['selected_text'] = test['selected_text']
sample.to_csv('submission.csv', index=False)
sample.head()

Unnamed: 0,textID,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to..."
3,01082688c6,happy
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!


In [16]:
test.head(20)

Unnamed: 0,textID,text,sentiment,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,Shanghai is also really exciting (precisely -...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,"Recession hit Veronique Branquinho, she has to..."
3,01082688c6,happy bday!,positive,happy
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,http://twitpic.com/4w75p - I like it!!
5,726e501993,that`s great!! weee!! visitors!,positive,that`s great!! weee!! visitors!
6,261932614e,I THINK EVERYONE HATES ME ON HERE lol,negative,I THINK EVERYONE HATES ME ON HERE lol
7,afa11da83f,"soooooo wish i could, but im in school and my...",negative,"soooooo wish i could, but im in school and my..."
8,e64208b4ef,and within a short time of the last clue all ...,neutral,and within a short time of the last clue all ...
9,37bcad24ca,What did you get? My day is alright.. haven`...,neutral,What did you get? My day is alright.. haven`...


The end of the notebook