# NLP

importing required libraries

In [1]:
import spacy
import json
import pandas as pd

verifying the Path of file

In [2]:
import os
print("Current working directory:", os.getcwd())
os.chdir('/Users/varunvaddi/desktop/NLP')

Current working directory: /Users/varunvaddi/Desktop/NLP


## PHASE 1

### Opening the Manually Annotated JSON File

loading the spaCy's Large model

In [3]:
nlp=spacy.load('en_core_web_lg')

Opening the Manually Annotated JSON file into Jupyter Notebook

In [4]:
with open('NER_Tweets_300.json','r') as f:
  Manual_NER = json.load(f)

Manual_NER_list = []

for i in Manual_NER['annotations']:
  if i != None:
    Manual_NER_list.append(
        {
            'sentence': i[0],
            'entities': i[1]['entities']
        }
    )

Manual_NER_list[:2]

[{'sentence': 'EU Commission President Jose Manuel Barroso has said the 25-member bloc will make it clear to the Russian President that the EU is not " satisfied " with Ukraine \'s disputed election .\r',
  'entities': [[0, 2, 'ORG'],
   [24, 43, 'PERSON'],
   [98, 105, 'NORP'],
   [125, 127, 'ORG'],
   [154, 161, 'GPE']]},
 {'sentence': "Prime Minister Viktor Yanukovych -- who was backed by Mr. Putin -- has been declared the winner of Sunday 's voting , but the EU is rejecting the results because of allegations of widespread voting fraud .\r",
  'entities': [[15, 32, 'PERSON'],
   [58, 63, 'PERSON'],
   [99, 105, 'DATE'],
   [126, 128, 'ORG']]}]

loading the data onto the dataframe from CSV

In [5]:
df = pd.read_csv('NER_Tweets_Data.csv')
df.head()

Unnamed: 0,tweets
0,EU Commission President Jose Manuel Barroso ha...
1,Prime Minister Viktor Yanukovych -- who was ba...
2,"On other issues , EU leaders say they will cal..."
3,Palestinian President Mahmoud Abbas has fired ...
4,"Mr. Abbas dismissed the men , including securi..."


In [6]:
df.describe()

Unnamed: 0,tweets
count,300
unique,300
top,EU Commission President Jose Manuel Barroso ha...
freq,1


## PHASE 2

### Separating the 7 Named Entity Tags only

In [7]:
spacy_output = []
tags = {"PERSON", "NORP", "ORG", "GPE", "LOC", "DATE", "MONEY"}
for sentence in df['tweets']:
    doc = nlp(sentence)
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in tags]
    spacy_output.append({"sentence": sentence, "entities": entities})

# Display the results
for result in spacy_output:
    print(f"Sentence: {result['sentence']}")
    for entity in result['entities']:
        print(f"  Entity: {entity[0]}, Label: {entity[1]}")
    print("\n")

Sentence: EU Commission President Jose Manuel Barroso has said the 25-member bloc will make it clear to the Russian President that the EU is not " satisfied " with Ukraine 's disputed election .
  Entity: EU Commission, Label: ORG
  Entity: Jose Manuel Barroso, Label: PERSON
  Entity: Russian, Label: NORP
  Entity: EU, Label: ORG
  Entity: Ukraine, Label: GPE


Sentence: Prime Minister Viktor Yanukovych -- who was backed by Mr. Putin -- has been declared the winner of Sunday 's voting , but the EU is rejecting the results because of allegations of widespread voting fraud .
  Entity: Viktor Yanukovych, Label: PERSON
  Entity: Putin, Label: PERSON
  Entity: Sunday, Label: DATE
  Entity: EU, Label: ORG


Sentence: On other issues , EU leaders say they will call on Russia to sign and ratify border agreements with Estonia and Latvia , and urge it to seek a political settlement in war-torn Chechnya .
  Entity: EU, Label: ORG
  Entity: Russia, Label: GPE
  Entity: Estonia, Label: GPE
  Entity

## PHASE 3

In [8]:
# Named Entities tagged by the SpaCy model
spacy_output[:300]

[{'sentence': 'EU Commission President Jose Manuel Barroso has said the 25-member bloc will make it clear to the Russian President that the EU is not " satisfied " with Ukraine \'s disputed election .',
  'entities': [('EU Commission', 'ORG'),
   ('Jose Manuel Barroso', 'PERSON'),
   ('Russian', 'NORP'),
   ('EU', 'ORG'),
   ('Ukraine', 'GPE')]},
 {'sentence': "Prime Minister Viktor Yanukovych -- who was backed by Mr. Putin -- has been declared the winner of Sunday 's voting , but the EU is rejecting the results because of allegations of widespread voting fraud .",
  'entities': [('Viktor Yanukovych', 'PERSON'),
   ('Putin', 'PERSON'),
   ('Sunday', 'DATE'),
   ('EU', 'ORG')]},
 {'sentence': 'On other issues , EU leaders say they will call on Russia to sign and ratify border agreements with Estonia and Latvia , and urge it to seek a political settlement in war-torn Chechnya .',
  'entities': [('EU', 'ORG'),
   ('Russia', 'GPE'),
   ('Estonia', 'GPE'),
   ('Latvia', 'GPE'),
   ('Chechny

In [9]:
# Manually tagged Named Entities
Manual_NER_list[:300]

[{'sentence': 'EU Commission President Jose Manuel Barroso has said the 25-member bloc will make it clear to the Russian President that the EU is not " satisfied " with Ukraine \'s disputed election .\r',
  'entities': [[0, 2, 'ORG'],
   [24, 43, 'PERSON'],
   [98, 105, 'NORP'],
   [125, 127, 'ORG'],
   [154, 161, 'GPE']]},
 {'sentence': "Prime Minister Viktor Yanukovych -- who was backed by Mr. Putin -- has been declared the winner of Sunday 's voting , but the EU is rejecting the results because of allegations of widespread voting fraud .\r",
  'entities': [[15, 32, 'PERSON'],
   [58, 63, 'PERSON'],
   [99, 105, 'DATE'],
   [126, 128, 'ORG']]},
 {'sentence': 'On other issues , EU leaders say they will call on Russia to sign and ratify border agreements with Estonia and Latvia , and urge it to seek a political settlement in war-torn Chechnya .\r',
  'entities': [[18, 20, 'ORG'],
   [51, 57, 'GPE'],
   [100, 107, 'GPE'],
   [112, 118, 'GPE'],
   [176, 184, 'GPE']]},
 {'sentence': 'Pale

function to get start and end indexes of each named entity

In [10]:
def retrieve_entity_positions(sentence, entity_text):
    """Find the start and end positions of the entity in the sentence."""
    start = sentence.find(entity_text)
    if start == -1:  # If the entity is not found, then return None
        return None
    end = start + len(entity_text)
    return start, end

Since, JSON output is having start & end indices instead of named entity, we need to convert the spaCy output from named entity to index positions, so that both of them will have same format inorder for Fair comparison

In [11]:
def format_conversion_from_spacy_to_manual(spacy_NER):
    """Convert spaCy format (word, tag) to manual format (start, end, tag)."""
    annotations_after_conversion = []

    for entry in spacy_NER:
        sentence = entry['sentence']
        entities = []

        for entity_text, entity_tag in entry['entities']:
            positions = retrieve_entity_positions(sentence, entity_text) #get each entity start & end index positions just like manual JSON output
            if positions:
                start, end = positions
                entities.append([start, end, entity_tag])

        annotations_after_conversion.append({'sentence': sentence, 'entities': entities})

    return annotations_after_conversion

function to calculate model metrics like TP, FP, FN, Precision, Recall & F1-Score

In [12]:
def calculate_metrics(manual_NER, spacy_NER):
    tp, fp, fn = 0, 0, 0
    tp_tweet, fn_tweet, fp_tweet = 0, 0, 0
    temp_counter = 0
    
    for manual, spacy in zip(manual_NER, spacy_NER):
        # Convert manual annotations into a set of tuples (start, end, label)
        set_manual_entities = set([tuple(entity) for entity in manual['entities']])

        # Convert spaCy annotations into a set of tuples (start, end, label)
        set_spacy_entities = set([tuple(entity) for entity in spacy['entities']])

        
        # True Positives
        tp_tweet = len(set_manual_entities & set_spacy_entities)
        tp = tp + tp_tweet
        # False Positives
        fp_tweet = len(set_spacy_entities - set_manual_entities)
        fp = fp + fp_tweet
        # False Negatives
        fn_tweet = len(set_manual_entities - set_spacy_entities)
        fn = fn + fn_tweet
        temp_counter += 1

        precision_tweet = tp_tweet / (tp_tweet + fp_tweet) if (tp_tweet + fp_tweet) > 0 else 0
        recall_tweet = tp_tweet / (tp_tweet + fn_tweet) if (tp_tweet + fn_tweet) > 0 else 0
        f1_tweet = 2 * (precision_tweet * recall_tweet)/ (precision_tweet + recall_tweet) if (precision_tweet + recall_tweet)> 0 else 0
        
        print("--------------------------------------------------------------------------------------------------------------------")
        print(f"tweet: {temp_counter}")
        print(set_manual_entities)
        print(set_spacy_entities)
        print("---------------------------")
        print(f"TP: {tp_tweet} | FP: {fp_tweet} | FN: {fn_tweet}")
        print("---------------------------")
        print(f"Precision of tweet: {precision_tweet}")
        print(f"Recall of tweet: {recall_tweet}")
        print(f"f1-score of tweet: {f1_tweet}")

    return tp, fp, fn

Displaying Overall model metrics

In [13]:
# Convert spaCy format to manual format
spacy_after_conversion = format_conversion_from_spacy_to_manual(spacy_output)

# Now calculate TP, FP, FN using the converted spaCy annotations
tp, fp, fn = calculate_metrics(Manual_NER_list, spacy_after_conversion)

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * (precision * recall)/ (precision + recall) if (precision + recall)> 0 else 0

--------------------------------------------------------------------------------------------------------------------
tweet: 1
{(154, 161, 'GPE'), (0, 2, 'ORG'), (125, 127, 'ORG'), (24, 43, 'PERSON'), (98, 105, 'NORP')}
{(154, 161, 'GPE'), (0, 2, 'ORG'), (0, 13, 'ORG'), (24, 43, 'PERSON'), (98, 105, 'NORP')}
---------------------------
TP: 4 | FP: 1 | FN: 1
---------------------------
Precision of tweet: 0.8
Recall of tweet: 0.8
f1-score of tweet: 0.8000000000000002
--------------------------------------------------------------------------------------------------------------------
tweet: 2
{(126, 128, 'ORG'), (15, 32, 'PERSON'), (99, 105, 'DATE'), (58, 63, 'PERSON')}
{(126, 128, 'ORG'), (15, 32, 'PERSON'), (99, 105, 'DATE'), (58, 63, 'PERSON')}
---------------------------
TP: 4 | FP: 0 | FN: 0
---------------------------
Precision of tweet: 1.0
Recall of tweet: 1.0
f1-score of tweet: 1.0
----------------------------------------------------------------------------------------------------

In [14]:
print("-----------------------------------------------------")
print("Manual NER vs SpaCy NER:")
print("----------------------------------------------------- \n")
print(f"Overall Precision: {precision}")
print(f"Overall Recall: {recall}")
print(f"Overall f1-score: {f1}")
print("-----------------------------------------------------")
print(f"Overall True Positives (TP): {tp}")
print(f"Overall False Positives (FP): {fp}")
print(f"Overall False Negatives (FN): {fn}")

-----------------------------------------------------
Manual NER vs SpaCy NER:
----------------------------------------------------- 

Overall Precision: 0.36363636363636365
Overall Recall: 0.36681222707423583
Overall f1-score: 0.3652173913043478
-----------------------------------------------------
Overall True Positives (TP): 252
Overall False Positives (FP): 441
Overall False Negatives (FN): 435


Summary:
- Precision and recall are relatively low, indicating that spaCy's NER model is making significant number of False Positives and False Negative predictions compared to manual NER.
- The number of false positives is slightly higher than false negatives, which implies that SpaCy is more likely to incorrectly identify entities that do not exist than to miss out on existing ones
- The relatively low F1-score reinforces that the performance of SpaCy's NER in this case is suboptimal, and there's room for improvement. 