In [None]:
import pandas as pd
import spacy
import torch
from transformers import pipeline
from keybert import KeyBERT

In [None]:
# Location of the raw company-relationship articles (one row per article).
input_csv_path = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Company Relationships\\Data\\CompanyRelationships.csv"
df = pd.read_csv(input_csv_path)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

## Filter DataFrame

In [None]:
# The single-file filter below is intentionally disabled: it is wrapped in a
# string literal so it never executes, which keeps every file in the analysis.
# To restrict the run to one source PDF again, remove the surrounding triple quotes.
"""
print("Length before filtering...")
print(len(df))
df = df[df['File']=='[__EXTERNAL__] Chevron New Energies Daily News ...(1).pdf']
print("Length after filtering...")
print(len(df))
"""

## Perform NER

In [None]:
# The spaCy model must be installed once before it can be loaded. If it is
# missing, run the following in a cell:
#   ! python -m spacy download en_core_web_lg
spacy_model_name = "en_core_web_lg"
nlp = spacy.load(spacy_model_name)

In [None]:
# Extract the organization (ORG) entities from each article; every other
# entity type is ignored.
article = df['Text'].tolist()

entities_by_article = []
for doc in nlp.pipe(article):
    # Keep the entity *text*, not the spaCy Span object. Spans at different
    # positions are distinct objects, so a set of Spans never collapses repeated
    # mentions of the same company; plain strings do. Storing text also avoids
    # keeping every parsed Doc alive through the DataFrame.
    # NOTE: the column now holds lists of str, so its CSV representation is a
    # list of quoted names (e.g. "['Chevron', 'Shell']").
    organization = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    entities_by_article.append(organization)

# Attach the per-article organization lists to the table. A plain list is
# assigned positionally (row i gets result i); wrapping it in pd.Series would
# align on index labels instead and yield NaN whenever df does not have a
# default 0..n-1 index (for example after rows have been filtered out).
df['Organizations'] = entities_by_article

In [None]:
# Remove articles that mention fewer than two distinct organizations
def count_length(row):
    """Return the number of distinct organizations in one article's entity list.

    Entities are compared by their text, so repeated mentions of the same name
    count once even when the list holds objects (such as spaCy spans) rather
    than plain strings.

    Args:
        row: Iterable of organization entities (strings, or objects whose
            ``str()`` is the entity text), or a missing value (None / NaN).

    Returns:
        int: Number of unique organization names; 0 for a missing value.
    """
    # A missing cell (None, or the float NaN that pandas uses) has no organizations.
    if row is None or isinstance(row, float):
        return 0
    # The set comprehension removes repeats of the same name.
    return len({str(entity) for entity in row})

# Keep only the articles whose organization count is greater than 1, then
# renumber the surviving rows from 0 (the old index is discarded).
has_multiple_orgs = df['Organizations'].apply(count_length) > 1
df = df[has_multiple_orgs].reset_index(drop=True)

In [None]:
# Preview the first five rows after the organization filter.
df.head(n=5)

## Add Sentiment

In [None]:
# Build the Hugging Face sentiment-analysis pipeline. No model is named here,
# so the library's default model for this task is used.
sentiment_analyzer = pipeline("sentiment-analysis")

articles = df['Text'].tolist()

# truncation=True cuts each article to the model's maximum sequence length.
# Without it, an article longer than that limit makes the pipeline raise
# instead of returning a prediction.
# Each result is a dict (label + score); one dict is stored per article.
df['Sentiment'] = sentiment_analyzer(articles, truncation=True)

## Add Key Phrases

In [None]:
# Embedding model that KeyBERT uses to score candidate key phrases.
keybert_model_name = 'distilbert-base-nli-mean-tokens'
kw_model = KeyBERT(model=keybert_model_name)

In [None]:
# Show the text of the article whose index label is 0 (spot check).
df.loc[0, 'Text']

In [None]:
# KeyBERT documents `docs` as a string or a list of strings, so pass a plain
# list instead of the pandas Series; the extractor then indexes the documents
# positionally rather than by the Series' index labels.
# The result is one list of keyphrases per article, in article order.
df['KeyPhrases'] = kw_model.extract_keywords(df['Text'].tolist(),
                                             keyphrase_ngram_range=(1, 3),  # phrases of 1 to 3 words
                                             stop_words='english',
                                             use_mmr=True,   # Maximal Marginal Relevance to diversify results
                                             diversity=0.3)

## Final Cleanup

In [None]:
# Keep only the columns that belong in the exported dataset, in this order.
output_columns = ['Text', 'Organizations', 'Sentiment', 'KeyPhrases']
df = df[output_columns]

In [None]:
# Number of articles remaining after cleanup.
df.shape[0]

## Export to File

In [None]:
# Destination for the cleansed articles; the row index is not written.
output_csv_path = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Company Relationships\\Data\\cleansed_CNE_articles.csv"
df.to_csv(output_csv_path, index=False)

Steps
 - Filter to only articles that mention 2 or more companies [NER]
 - Provide sentiment on each article
 - Pull key phrases from each article
 - Export the cleansed articles to a CSV file