###### Illinois is famous for being one of the very few states in the country with negative population growth.  The objective of your final project is to:

* Identify the key reasons for the declining population by extracting meaningful insights from unstructured text
* Provide actionable recommendations on what can be done to reverse this trend

In [4]:
!pip install jupyterthemes
!jt -t chesterish




In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

In [2]:
data = pd.read_pickle("/project/msca/kadochnikov/news/news_chicago_il.pkl")

## Clean-up the noise (eliminate articles irrelevant to the analysis)

In [3]:
#Choose the article only in English
data=data[data.language=='english']

In [4]:
# Make sure there is no NaNs in the dataset
data=data.dropna()

In [5]:
# Remove articles that have no word 'population' in it
list=[]
for i in range(0,data.shape[0]):
    
    list.append('population'in data.text[i])
    
data=data[list]

In [6]:
data.shape
# There are about 5015 articles that have at lease one 'population' in the article

(5015, 4)

In [7]:
# Reset the pandas dataframe index
data=data.reset_index(drop=True)

## Detect major topics

#### 1. Use Tokenization, Lemmatization, word similarity and context

* Tokenization, Lemmatization

In [8]:
# Concatenate strings 
text=data.text.str.cat(sep=' ')

In [144]:
# Tokenize the text
from nltk.tokenize import word_tokenize
text_token=word_tokenize(text.lower())

In [145]:
# Clean data: convert to lower case, remove stopwords, punctuation, numbers, etc
from nltk.corpus import stopwords
# Only return alphabetic string(strip tokens with numbers or punctuation)
tokens = [w for w in text_token
         if w.isalpha()]
#Remove stopwords(useless words like and or ) in english 
no_stops = [t for t in tokens
           if t not in stopwords.words('english')]

In [154]:
# pickle no_stops since it took a lot time to run

with open('no_stops.pkl', 'wb') as f:
    pickle.dump(no_stops, f)

In [9]:
# Read Pickle file as list type
with open("no_stops.pkl", "rb") as fp:
    no_stops = pickle.load(fp)

In [10]:
# Use Lemmatizaion to get the better idea of the words

In [None]:
# Find top-10 tokens after applying lemmatization to the tokens


from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized=[wordnet_lemmatizer.lemmatize(w) for w in no_stops]

In [None]:
from collections import Counter
Counter(lemmatized).most_common(20)

In [13]:
#### We have to instantiate a Text object first, and then call it on that object.
import nltk as nltk
from nltk.text import Text
textList = Text(no_stops)

* shows every occurrence of a given word, together with some context.

In [14]:
textList.concordance("population")

Displaying 25 of 7588 matches:
etropolitan area central illinois population approximately people according ce
etropolitan area central illinois population approximately people according ce
square miles state illinois total population catholic keywords conference cath
monthly help determine percentage population contracted abbott said planned sh
al condition otherwise vulnerable population hopefully dialogue able lightfoot
s chicago despite accounting city population reporting illinois department pub
espite making less fifth illinois population latinos accounted close cases sta
orts funds allocated based states population size heavily impacted outbreak we
 changed decrease risk vulnerable population illinois department public health
ing facilities across state areas population risk said decades institutional i
 dollars given states often based population counts illinois response rate cen
disease pritzker said based state population effort illinois likely twice simi
rs spending million m

* Using "similar" helps us discover what other words appear in a similar range of contexts

In [15]:
textList.similar("population")

illinois residents people state chicago percent cases deaths community
total according city officials populations new said million public
number black


#### 2. N Grams

In [16]:
# #tokens = nltk.tokenize.word_tokenize(raw)
# tokens = nltk.word_tokenize(text)

# #stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

# word_list = []

# Filter out words that have punctuation and make everything lower-case
cleaned_words = no_stops

bgs = [b for b in nltk.bigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords]
tgs = [b for b in nltk.trigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and b[2] not in stopwords]

In [17]:
fdist_2 = nltk.FreqDist(bgs)
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(public, health)",3559
1,"(cook, county)",3144
2,"(pritzker, said)",2354
3,"(new, york)",2013
4,"(health, care)",1932
5,"(social, distancing)",1798
6,"(illinois, department)",1719
7,"(gov, pritzker)",1681
8,"(tested, positive)",1674
9,"(united, states)",1543


In [18]:
fdist_3 = nltk.FreqDist(tgs)
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(department, public, health)",1354
1,"(mayor, lori, lightfoot)",1225
2,"(illinois, department, public)",979
3,"(cook, county, jail)",781
4,"(chicago, mayor, lori)",637
5,"(gov, jb, pritzker)",620
6,"(president, donald, trump)",582
7,"(public, health, officials)",449
8,"(donald, trump, cloud)",428
9,"(trump, cloud, spotted)",428


#### Creating targeted N-Grams

In [19]:

bgs = [b for b in nltk.bigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and \
       (b[0] == 'population' or b[1] == 'population')]

tgs = [b for b in nltk.trigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and \
       (b[0] == 'population' or b[1] == 'population' or b[2] == 'population')]

# population must be one of the words

In [20]:
fdist_2 = nltk.FreqDist(bgs)
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(county, population)",363
1,"(illinois, population)",319
2,"(jail, population)",303
3,"(state, population)",298
4,"(city, population)",262
5,"(population, census)",192
6,"(black, population)",178
7,"(population, loss)",173
8,"(population, million)",170
9,"(percentage, population)",167


In [21]:
fdist_3 = nltk.FreqDist(tgs)
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(county, population, census)",185
1,"(significant, percentage, population)",124
2,"(percentage, population, underserved)",124
3,"(population, underserved, christmas)",124
4,"(jail, population, dropped)",122
5,"(size, county, population)",108
6,"(county, population, county)",108
7,"(population, county, income)",108
8,"(making, city, population)",103
9,"(city, jewish, population)",88


### Creating N-Grams of custom length

In [22]:
raw=text
n = 4
fourgrams = nltk.ngrams(raw.split(), n)

n = 5
fivegrams = nltk.ngrams(raw.split(), n)

n = 6
sixgrams = nltk.ngrams(raw.split(), n)

In [23]:
fdist_4 = nltk.FreqDist(fourgrams)
fdist_4_df = pd.DataFrame(fdist_4.most_common(),
                    columns=['Word', 'Frequency'])

fdist_4_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(the, Illinois, Department, of)",1006
1,"(Illinois, Department, of, Public)",962
2,"(Department, of, Public, Health)",924
3,"(the, spread, of, the)",898
4,"(Chicago, Mayor, Lori, Lightfoot)",516
5,"(tested, positive, for, the)",474
6,"(Re:, Donald, Trump, cloud)",428
7,"(Donald, Trump, cloud, spotted)",428
8,"(Trump, cloud, spotted, over)",428
9,"(cloud, spotted, over, Chicago)",428


In [24]:
fdist_5 = nltk.FreqDist(fivegrams)
fdist_5_df = pd.DataFrame(fdist_5.most_common(),
                    columns=['Word', 'Frequency'])

fdist_5_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(Illinois, Department, of, Public, Health)",680
1,"(the, Illinois, Department, of, Public)",511
2,"(Re:, Donald, Trump, cloud, spotted)",428
3,"(Donald, Trump, cloud, spotted, over)",428
4,"(Trump, cloud, spotted, over, Chicago)",428
5,"(cloud, spotted, over, Chicago, this)",428
6,"(spotted, over, Chicago, this, weekend.)",428
7,"(over, Chicago, this, weekend., Posted)",428
8,"(Chicago, this, weekend., Posted, by:)",428
9,"(this, weekend., Posted, by:, r.w.s.)",407


In [25]:
fdist_6 = nltk.FreqDist(sixgrams)
fdist_6_df = pd.DataFrame(fdist_6.most_common(),
                    columns=['Word', 'Frequency'])

fdist_6_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(Re:, Donald, Trump, cloud, spotted, over)",428
1,"(Donald, Trump, cloud, spotted, over, Chicago)",428
2,"(Trump, cloud, spotted, over, Chicago, this)",428
3,"(cloud, spotted, over, Chicago, this, weekend.)",428
4,"(spotted, over, Chicago, this, weekend., Posted)",428
5,"(over, Chicago, this, weekend., Posted, by:)",428
6,"(Chicago, this, weekend., Posted, by:, r.w.s.)",407
7,"(this, weekend., Posted, by:, r.w.s., ())",407
8,"(weekend., Posted, by:, r.w.s., (), Date:)",407
9,"(older, adults, and, people, with, existing)",342


#### 3. Sentiment Analysis

In [26]:
# First apply it on the title
# Remove special characters to avoid problems with analysis
data['title_clean'] = data['title'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

In [27]:
# Visulize if there is a difference
pd.set_option('display.max_colwidth', 100)
data[['title', 'title_clean']].head(5)

Unnamed: 0,title,title_clean
0,Mayor Lightfoot announces plans for reopening Chicago,Mayor Lightfoot announces plans for reopening Chicago
1,"Jushi Announces the Beginning of Adult-Use Cannabis Sales at its Illinois Dispensary in Normal, ...","Jushi Announces the Beginning of AdultUse Cannabis Sales at its Illinois Dispensary in Normal, I..."
2,"Jushi Announces the Beginning of Adult-Use Cannabis Sales at its Illinois Dispensary in Normal, ...","Jushi Announces the Beginning of AdultUse Cannabis Sales at its Illinois Dispensary in Normal, I..."
3,Pope Francis Names Father Louis Tylka of Archdiocese of Chicago as Coadjutor Bishop of Peoria,Pope Francis Names Father Louis Tylka of Archdiocese of Chicago as Coadjutor Bishop of Peoria
4,Coronavirus in Illinois updates: Here’s what’s happening Monday with COVID-19 in the Chicago area,Coronavirus in Illinois updates: Heres whats happening Monday with COVID19 in the Chicago area


## Demonstrate how the city / state can attract new businesses (positive sentiment)

In [28]:
# Find the titles with the positive attitude
data['polarity'] = data.apply(lambda x: TextBlob(x['title_clean']).sentiment.polarity, axis=1)
data['subjectivity'] = data.apply(lambda x: TextBlob(x['title_clean']).sentiment.subjectivity, axis=1)
data[['title_clean', 'polarity', 'subjectivity']][data['polarity'] > 0.8]

Unnamed: 0,title_clean,polarity,subjectivity
147,Beautiful Places to Visit in Illinois Burden Falls The Shawn,0.85,1.0
968,OpEd: Is bankruptcy the best option for Illinois,1.0,0.3
1202,"For Chicago, coronavirus is the perfect storm, Mayor Lori Lightfoot says",1.0,1.0
1266,Celebrating the life of legendary Chicago DJ Timbuck2 Event Recap,1.0,1.0
1588,EXCLUSIVE: Greatest fear: Chicago hides names of released prisoners from police,1.0,1.0
1892,Underperforming school battles to be one of Illinois best,1.0,0.3
1895,Underperforming school battles to be one of Illinois best,1.0,0.3
1898,Underperforming school battles to be one of Illinois best The Edwardsville Intelligencer,1.0,0.3
2146,"Daywatch: Pritzker says COVID19 wont peak until midMay, Chicago businesses await federal loans a...",1.0,0.3
2285,"2020 Best Brightest MBAs: Joshua Lah, University of Chicago Booth",1.0,0.3


In [29]:
# Positive article title number:
data[['title_clean', 'polarity', 'subjectivity']][data['polarity'] > 0].shape[0]

1733

In [30]:
# Negative article title number:
data[['title_clean', 'polarity', 'subjectivity']][data['polarity'] < 0].shape[0]

690

## Identify top reasons for population decline (negative sentiment)
* Suggest corrective actions

In [31]:
# Dictionary Based
# Find the titles with the Negative attitude
data[['title_clean', 'polarity', 'subjectivity']][data['polarity'] < -0.6]

Unnamed: 0,title_clean,polarity,subjectivity
94,ILLINOIS STATE SENATE DISTRICT 50: Senators push for answers on release of violent felons,-0.8,1.0
260,Chicago mayor says coronavirus is devastating AfricanAmerican community,-1.0,1.0
267,Chicago mayor says coronavirus is devastating AfricanAmerican community 99 Reporter,-1.0,1.0
462,"NBC 5 Investigates: Illinois, Indiana Rank On Lower End of Coronavirus Testing",-0.8,0.9
914,"With voters set to decide on progressive income tax, Illinois wealth flight among worst in the n...",-1.0,1.0
1641,Losing residents: Illinois sees worst population decline in the country this decade,-1.0,1.0
1643,Illinois saw nations worst population loss during the decade,-1.0,1.0
2208,ILLINOIS ONE OF WORST STATES FOR BUSINESS TAX CLIMATE,-1.0,1.0
2260,Mother Is Disgusted After Her 26YearOld Son Threw A Chicago House Party While She Worked A 24Hou...,-1.0,1.0
2495,"NBC 5 Investigates: Illinois, Indiana Rank On Lower End of Coronavirus Testing NBC Chicago",-0.8,0.9


* For this project it is import to focus on the reason why people are leaving Chicago, so i will pay more attention on the negative news title.
* Possible reason causing people to leave the city:
1. Fear of the relase of violent felons 
2. Covid cases in African American Community
3. Not enough covid testing
4. Progressive income tax
5. Busines Tax
6. Safety(video of brutal police shooting of unarmed man in Chicago train station sparks outrage)data.title[4940]

In [32]:
# Analyse the text related with the drop population
text1643=data.text[1643]
# This article is specifily talking about the population problem in Chicago
# I can do some analysis for example
# top n words
# n gram
# topic analysis
# etc

In [22]:
data.title[1643]

"Illinois saw nation's worst population loss during the decade"

In [33]:
# Tokenize the text
from nltk.tokenize import word_tokenize
text_token1643=word_tokenize(text1643.lower())

# Clean data: convert to lower case, remove stopwords, punctuation, numbers, etc
from nltk.corpus import stopwords
# Only return alphabetic string(strip tokens with numbers or punctuation)
tokens1643 = [w for w in text_token1643
         if w.isalpha()]
#Remove stopwords(useless words like and or ) in english 
no_stops1643 = [t for t in tokens1643
           if t not in stopwords.words('english')]

# Use Lemmatizaion to get the better idea of the words

# Find top-10 tokens after applying lemmatization to the tokens


from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized1643=[wordnet_lemmatizer.lemmatize(w) for w in no_stops1643]

from collections import Counter
Counter(lemmatized1643).most_common(10)

[('state', 50),
 ('illinois', 37),
 ('tax', 34),
 ('population', 28),
 ('decline', 18),
 ('would', 14),
 ('hike', 14),
 ('income', 14),
 ('resident', 12),
 ('year', 11)]

In [34]:
#### We have to instantiate a Text object first, and then call it on that object.
import nltk as nltk
from nltk.text import Text
textList1643 = Text(no_stops1643)

#shows every occurrence of a given word, together with some context.

textList1643.concordance("population")


Displaying 25 of 28 matches:
eased census bureau show illinois population dropped people largest raw declin
 largest raw decline state entire population naperville illinois city percenta
 beleaguered west virginia decade population decline especially loss prime adu
is simply kept pace average state population growth since start great recessio
n illinois labor force peak state population would million residents larger to
n residents larger today increase population would yield economy least estimat
son clair record unsubscribe time population decline continues july july illin
line continues july july illinois population shrank raw decline nation behind 
us bureau largest driver illinois population decline people leaving states arr
 per day one person every minutes population decline reached record levels yea
ent income tax hike state history population decline second year tax increase 
rtunity illinois consecutive year population decline among states west virgini
nia experienced consecu

In [35]:

#Using "similar" helps us discover what other words appear in a similar range of contexts

textList1643.similar("population")

economy


In [36]:
# By on analysis this article seems like the economy is one of the major reasons
# high debt high taxes is also the reason

In [37]:
#### 2. N Grams

# #tokens = nltk.tokenize.word_tokenize(raw)
# tokens = nltk.word_tokenize(text)

# #stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

# word_list = []

# Filter out words that have punctuation and make everything lower-case
cleaned_words = no_stops1643

bgs = [b for b in nltk.bigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords]
tgs = [b for b in nltk.trigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and b[2] not in stopwords]

fdist_2 = nltk.FreqDist(bgs)
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(income, tax)",14
1,"(population, decline)",13
2,"(tax, hike)",8
3,"(tax, hikes)",6
4,"(illinois, population)",5
5,"(state, population)",4
6,"(population, growth)",4
7,"(estimated, billion)",4
8,"(would, also)",4
9,"(residents, states)",4


In [38]:
fdist_3 = nltk.FreqDist(tgs)
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(income, tax, hike)",6
1,"(progressive, income, tax)",4
2,"(illinois, policy, institute)",3
3,"(state, income, tax)",3
4,"(illinois, population, decline)",3
5,"(flat, income, tax)",3
6,"(kept, pace, average)",2
7,"(pace, average, state)",2
8,"(average, state, population)",2
9,"(state, population, growth)",2


In [39]:
#### Creating targeted N-Grams


bgs = [b for b in nltk.bigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and \
       (b[0] == 'population' or b[1] == 'population')]

tgs = [b for b in nltk.trigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and \
       (b[0] == 'population' or b[1] == 'population' or b[2] == 'population')]

# population must be one of the words

fdist_2 = nltk.FreqDist(bgs)
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)



Unnamed: 0,Word,Frequency
0,"(population, decline)",13
1,"(illinois, population)",5
2,"(state, population)",4
3,"(population, growth)",4
4,"(population, would)",2
5,"(population, dropped)",1
6,"(entire, population)",1
7,"(population, naperville)",1
8,"(decade, population)",1
9,"(increase, population)",1


In [41]:


fdist_3 = nltk.FreqDist(tgs)
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)



Unnamed: 0,Word,Frequency
0,"(illinois, population, decline)",3
1,"(average, state, population)",2
2,"(state, population, growth)",2
3,"(population, growth, since)",2
4,"(show, illinois, population)",1
5,"(illinois, population, dropped)",1
6,"(population, dropped, people)",1
7,"(state, entire, population)",1
8,"(entire, population, naperville)",1
9,"(population, naperville, illinois)",1


### Use snorkle to lable the data

In [113]:
# Stuy Snorkle(Week 4)
import sys
print(sys.version)

3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]


In [114]:
import snorkel as sk
import pandas as pd
from snorkel.labeling import labeling_function, LabelModel, PandasLFApplier, LFAnalysis, \
                            LabelingFunction, MajorityLabelVoter, filter_unlabeled_dataframe
from snorkel.analysis import get_label_buckets
from snorkel.preprocess import preprocessor
from snorkel.labeling.lf.nlp import nlp_labeling_function
from snorkel.utils import probs_to_preds

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics



from textblob import TextBlob

import re
import operator

from tqdm.auto import tqdm
tqdm.pandas()

In [142]:
text_snorkel = data[['text']]
#text_snorkel = text_snorkel.rename(columns={'title_clean': 'text'})

### Read this article to find the topic to do classification
https://www.governing.com/topics/finance/sl-illinois-sees-population-decline.html

In [188]:
### Labeling with Regex
ABSTAIN = -1
Tax = 0
Safety = 1
Investment = 2
Health = 3
Weather= 4
Other = 5

In [200]:
#Trying!!
# Labeling functions is a key concept in Snorkel. 
# So you can write as many functions that can contain any logic to label the observation


@labeling_function()
def taxes(x):
    return Tax if re.search(r"tax*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def crime(x):
    return Safety if re.search(r"crim*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def violence(x):
    return Safety if re.search(r"violen*", x.text, flags=re.I) else ABSTAIN
@labeling_function()

def discrimination(x):
    return Safety if re.search(r"discrimin*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def Edu(x):
    return Investment if re.search(r"educat*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def infrastructure(x):
    return Investment if re.search(r"infrastru*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def manufacture(x):
    return Investment if re.search(r"manufactu*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def debt(x):
    return Investment if re.search(r"debt*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def COVID(x):
    return Health if re.search(r"COVID*", x.text, flags=re.I) else ABSTAIN\

@labeling_function()
def covid(x):
    return Health if re.search(r"covid*", x.text, flags=re.I) else ABSTAIN


@labeling_function()
def virus(x):
    return Health if re.search(r"virus*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def pandemic(x):
    return Health if re.search(r"pandemi*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def cold(x):
    return Weather if re.search(r"cold*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def winter(x):
    return Weather if re.search(r"winter*", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def storm(x):
    return Weather if re.search(r"storm*", x.text, flags=re.I) else ABSTAIN

@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x

@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return Other if x.polarity > 0.9 else ABSTAIN

@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return Other if x.subjectivity >= 0.5 else ABSTAIN


In [201]:
lfs=[taxes,
     crime,
     violence,
     discrimination,
     Edu,
     infrastructure,
     manufacture,
     debt,
     covid,
     COVID,
     virus,
     pandemic,
     cold,
     winter,
     storm,
     textblob_polarity,
     textblob_subjectivity]


applier = PandasLFApplier(lfs)

# The output of the apply method is a label matrix, which is a # NumPy array L 
# with one column for each LF and one row for each data point
L_train = applier.apply(text_snorkel)

  from pandas import Panel
100%|██████████| 5015/5015 [00:34<00:00, 146.69it/s]


In [202]:
#### Explore labeling results
# Polarity: The set of unique labels this LF outputs (excluding abstains)
# Coverage: The fraction of the dataset the LF labels
# Overlaps: The fraction of the dataset where this LF and at least one other LF label
# Conflicts: The fraction of the dataset where this LF and at least one other LF label and disagree

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
taxes,0,[0],0.990628,0.935992,0.935992
crime,1,[1],0.528415,0.528415,0.528415
violence,2,[1],0.106481,0.106481,0.106481
discrimination,3,[1],0.037687,0.037687,0.037687
Edu,4,[2],0.215952,0.215753,0.215753
infrastructure,5,[2],0.052642,0.052642,0.052642
manufacture,6,[2],0.068395,0.068395,0.068395
debt,7,[2],0.129013,0.129013,0.129013
covid,8,[3],0.456431,0.456431,0.455833
COVID,9,[3],0.456431,0.456431,0.455833


In [203]:
# Explore the records where 'cold' labeled as 'weather'
text_snorkel.iloc[L_train[:, 12] ==Weather]

Unnamed: 0,text,label,classes
3,Pope Francis Names Father Louis Tylka of Archdiocese of Chicago as Coadjutor Bishop of Peoria Ma...,4,Weather
4,"As the coronavirus spreads, the Tribune is tracking Illinois cases here »\nHere’s what’s happeni...",3,Health
5,"Flipboard The latest Dr. Ngozi Ezike, director of the Illinois Department of Public Health, at a...",3,Health
6,"Coronavirus in Illinois: How many deaths, cases and tests\npeople have died in Illinois from COV...",3,Health
7,The Latest on the coronavirus pandemic. The new coronavirus causes mild or moderate symptoms for...,3,Health
...,...,...,...
5008,"The fight over COVID-19 has become a legal battle in Illinois, pitting a Republican state lawmak...",3,Health
5009,"This is a work in progress In response to the COVID-19 pandemic, Gov. JB Pritzker has ordered Il...",3,Health
5010,"The fight over COVID-19 has become a legal battle in Illinois, pitting a Republican state lawmak...",3,Health
5011,US & World Illinois Lawmaker Files Lawsuit; Wants Stay-At-Home Rules Lifted Illinois Gov. J.B. P...,3,Health


In [204]:
text_snorkel.text[5010]

'The fight over COVID-19 has become a legal battle in Illinois, pitting a Republican state lawmaker from a rural county against the Democratic governor.\nDarren Bailey argued the state\'s latest stay-at-home order was taking an unfair economic toll on his constituents in Clay County. So he sued last week. And won. Sort of.\nA Clay County circuit court judge issued a temporary restraining order against the state\'s extension of its stay-at-home policy.\nThat ruling only applies to one person, though — Bailey.\nEveryone else living in Clay County, and the rest of the state, still must abide by the 30-day extension set by Gov. J.B. Pritzker.\n"Enough is enough," Bailey said in a statement. "I filed this lawsuit on behalf of myself and my constituents who are ready to go back to work and resume a normal life."\nPritzker is appealing the judge\'s order.\nAt his daily press conference Tuesday, the governor called the lawsuit a "cheap political stunt" and the court decision "dangerous." He sa

###### Baseline model: the majority vote on a per-data point basis

In [205]:
# Convert labels from LF into a single label
majority_model = MajorityLabelVoter(cardinality=6, verbose=True) # cardinality = number of categories
preds_train_majority_model = majority_model.predict(L=L_train)

In [206]:
text_snorkel_majority_model=data[['text']]
text_snorkel_majority_model['label'] = preds_train_majority_model

text_snorkel_majority_model['classes'] = text_snorkel_majority_model.label.replace([-1, 0, 1,2,3,4,5],['ABSTAIN', 'Tax', 'Safety', 'Investment','Health','Weather','Other'])
                                                                           
text_snorkel_majority_model.classes.value_counts()                                                                           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Health        2324
ABSTAIN       1586
Weather        594
Tax            274
Safety         144
Investment      86
Other            7
Name: classes, dtype: int64

###### More sophisticated Snorkel LabelModel, combining outputs of the LFs

In [207]:
label_model = LabelModel(cardinality=6, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
preds_train_label_model = label_model.predict(L=L_train)
probs_train = label_model.predict_proba(L_train)

In [208]:
preds_train_label_model

array([3, 3, 3, ..., 3, 3, 3])

In [209]:
text_snorkel['label'] = preds_train_label_model


In [210]:
text_snorkel['classes'] = text_snorkel.label.replace([-1, 0, 1,2,3,4,5],['ABSTAIN', 'Tax', 'Safety', 'Investment','Health','Weather','Other'])

In [211]:
text_snorkel.classes.value_counts()

Health        2524
Weather       1846
ABSTAIN        294
Safety         251
Other           58
Investment      42
Name: classes, dtype: int64

## text_snorkel_majority_model makes more sense to me so i will go with this dataset

In [212]:
# pickle text_snorkel_majority_model dataset

with open('text_Labeled.pkl', 'wb') as f:
    pickle.dump(text_snorkel_majority_model, f)

In [10]:
# Read Pickle file as list type
with open("text_Labeled.pkl", "rb") as fp:
    text_snorkel = pickle.load(fp)

###### Filtering out training data points which did not recieve a label from any LF
These data points contain no signal.

In [214]:
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=text_snorkel, y=probs_train, L=L_train
)

In [215]:
### Finished label data

In [216]:
# Find the text with the positive attitude
df_train_filtered['polarity'] = df_train_filtered.apply(lambda x: TextBlob(x['text']).sentiment.polarity, axis=1)
df_train_filtered['subjectivity'] = df_train_filtered.apply(lambda x: TextBlob(x['text']).sentiment.subjectivity, axis=1)
df_train_filtered[['text','classes','polarity', 'subjectivity']][df_train_filtered['polarity'] > 0.8]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,text,classes,polarity,subjectivity


In [217]:
df_train_filtered[['text','classes','polarity', 'subjectivity']][df_train_filtered['polarity'] > 0.3]

Unnamed: 0,text,classes,polarity,subjectivity
83,Chicago gets in on the Apple Maps 'Look Around' action More looking around for more people. What...,Tax,0.322222,0.404444
147,Beautiful Places to Visit in Illinois Burden Falls The Shawnee National Forest is one of America...,ABSTAIN,0.415602,0.635061
343,"This has to be one of the most exciting cities in the world, with so much to offer tourists, bus...",Weather,0.308803,0.478608
481,"Call Today at 866-733-4278, text us at 662-524-9099, or Click HERE to Apply\nPediaStaff is inter...",Safety,0.340038,0.492857
607,"As many of us already receive the 2020 census forms counting all residents from chicago, What ar...",Other,0.65,0.625
608,does anyone know who has more grocery stores in regards to population ratio suburbs vs city?,ABSTAIN,0.5,0.5
941,THE ROLE\nA stellar opportunity to have massive impact: join the rapidly growing team at a compa...,ABSTAIN,0.3175,0.625833
1624,"Things are fine. We got the marijuana and more gambling, so we'll be able to pay all the backlog...",ABSTAIN,0.340923,0.457589
1717,State Rep. Mark Batinick filed legislation he said aims to keep Illinois students in state for c...,ABSTAIN,0.307464,0.469262
1849,CHICAGO - Chicago officials said Wednesday that the number of new coronavirus cases among the La...,Health,0.318182,0.431481


* After label the data by snorkel, we can use sentiment analysis on text instead of title and check the class, also value count are able to give us the idea of the % of each problems.

In [218]:
# Dictionary Based
# Find the text with the Negative attitude
df_train_filtered[['text','classes', 'polarity', 'subjectivity']][df_train_filtered['polarity'] < -0.2]



Unnamed: 0,text,classes,polarity,subjectivity
552,"In order to push forward with in-person elections on Tuesday, local election officials across Il...",Health,-0.208333,0.291667
2528,Other cities and states across the country are implementing similar measures to stem the spread ...,ABSTAIN,-0.225298,0.465079
2722,This is why the steady depopulation of the state of Illinois is shocking. https://t.co/YLHshkguE...,ABSTAIN,-0.416667,0.75
2773,CHICAGO (AP) — Chicago's mayor says the city is launching a health campaign focused on black and...,Health,-0.279762,0.505952
3135,CHICAGO (AP) — Chicago's mayor says the city is launching a health campaign focused on black and...,Health,-0.279762,0.505952
3292,NEWSROOM: Bristol-Myers Squibb Company\nCONTENT: Multimedia with summary\nIn honor of Black Hist...,Weather,-0.255556,0.344444
3294,"Chicago Barbershops are Improving Access to Cancer Screening Feb 13, 2020 1:25 PM ET Tweet This...",Weather,-0.255556,0.344444
4972,"With signs that COVID-19 is rising among Illinois prison inmates and staffers, Gov. J.B. Pritzke...",Health,-0.25,0.75


Analysing negative attitude by classes help to see what problem is the majority problem that people concern about.

In [187]:
df_train_filtered.text[2773]

'CHICAGO (AP) — Chicago\'s mayor says the city is launching a health campaign focused on black and brown communities.\nMayor Lori Lightfoot\'s comments Monday follow a WBEZ report highlighting the disproportionate number of black residents among those who have died of COVID-19 complications in the city.\nLightfoot called the radio station\'s finding that 70% of recorded deaths due to the coronavirus in the city were black residents "devastating."\nThe report said black residents make up 29% of the city\'s population.\nLightfoot noted that black and brown communities\' access to healthcare has been unequal for decades.'

In [219]:
# pickle df_train_filtered 

with open('df_train_filtered.pkl', 'wb') as f:
    pickle.dump(df_train_filtered, f)

In [11]:
# Read Pickle file 
with open("df_train_filtered.pkl", "rb") as fp:
    df_train_filtered = pickle.load(fp)

#### Apply some NER(Using Spacy)
Learn Spacy on Datcamp

### Topic Model (LDA)

In [12]:
import time
import math
import re
from textblob import TextBlob
import pandas as pd

import nltk as nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import string


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim

In [13]:
df_train_filtered.shape

(4995, 5)

In [14]:
text_eng=df_train_filtered

In [15]:
# Remove special characters to avoid problems with analysis
text_eng['text_clean'] = text_eng['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

In [16]:
pd.set_option('display.max_colwidth', 100)
text_eng[['text', 'text_clean']].head(5)

Unnamed: 0,text,text_clean
0,"Mayor Lightfoot announces plans for reopening Chicago By Reel Chicago May 11, 2020 0 Mayor Lori ...","Mayor Lightfoot announces plans for reopening Chicago By Reel Chicago May 11, 2020 0 Mayor Lori ..."
1,New Adult-Use Customers Can Only Shop Online or Over-the-Phone for In-Store Pickup Medical Patie...,New AdultUse Customers Can Only Shop Online or OverthePhone for InStore Pickup Medical Patients ...
2,New Adult-Use Customers Can Only Shop Online or Over-the-Phone for In-Store Pickup\nMedical Pati...,New AdultUse Customers Can Only Shop Online or OverthePhone for InStore PickupMedical Patients C...
3,Pope Francis Names Father Louis Tylka of Archdiocese of Chicago as Coadjutor Bishop of Peoria Ma...,Pope Francis Names Father Louis Tylka of Archdiocese of Chicago as Coadjutor Bishop of Peoria Ma...
4,"As the coronavirus spreads, the Tribune is tracking Illinois cases here »\nHere’s what’s happeni...","As the coronavirus spreads, the Tribune is tracking Illinois cases here Heres whats happening Mo..."


In [17]:
doc_complete=text_eng.text_clean.tolist()

In [18]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]     

In [19]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [20]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

In [181]:
import warnings

warnings.simplefilter('ignore')
# Three-topic Model
warnings.simplefilter('ignore')

# Running and Trainign LDA model on the document term matrix.
%time ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50) #3 topics
print(*ldamodel.print_topics(num_topics=3, num_words=3), sep='\n')
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)


CPU times: user 5min 10s, sys: 625 ms, total: 5min 11s
Wall time: 5min 12s
(0, '0.009*"illinois" + 0.008*"state" + 0.007*"chicago"')
(1, '0.007*"chicago" + 0.004*"one" + 0.004*"day"')
(2, '0.014*"said" + 0.010*"covid19" + 0.010*"state"')


In [87]:
# Five-topic Model
#%time ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50) #5 topics
#print(*ldamodel.print_topics(num_topics=5, num_words=5), sep='\n')
#lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
#pyLDAvis.display(lda_display)


### Key_word Exraction 

In [229]:
import os
import sys
print(sys.version)

import re
import pandas as pd
from rake_nltk import Rake

3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]


In [230]:
r = Rake() # Uses stopwords for english from NLTK, and all puntuation characters.

def rake_implement(x,r):
    r.extract_keywords_from_text(x) # r.extract_keywords_from_text(<text to process>)
    return r.get_ranked_phrases() # r.get_ranked_phrases() # To get keyword phrases ranked highest to lowest.

In [231]:
df_train_filtered['rake_phrases']=df_train_filtered['text_clean'].apply(lambda x: rake_implement(x,r))

In [232]:
df_train_filtered[['text_clean', 'rake_phrases']].head(5)

Unnamed: 0,text_clean,rake_phrases
0,"Mayor Lightfoot announces plans for reopening Chicago By Reel Chicago May 11, 2020 0 Mayor Lori ...","[influenzalike illness andor covidlike illness case investigation contact tracing, based onincid..."
1,New AdultUse Customers Can Only Shop Online or OverthePhone for InStore Pickup Medical Patients ...,"[instore pickup frequently sanitizing hightouch surfaces deep cleaning, united states private se..."
2,New AdultUse Customers Can Only Shop Online or OverthePhone for InStore PickupMedical Patients C...,"[instore pickup frequently sanitizing hightouch surfaces deep cleaning, statementsthis press rel..."
3,Pope Francis Names Father Louis Tylka of Archdiocese of Chicago as Coadjutor Bishop of Peoria Ma...,"[pope francis names father louis tylka, father tylka attended mundelein seminary, orland park 19..."
4,"As the coronavirus spreads, the Tribune is tracking Illinois cases here Heres whats happening Mo...","[continue holding religious services despite anticoronavirus social distancing requirements, fir..."


In [233]:
#Selecting on RAKE keywords
# Find Article include keywork covid
df_train_filtered['rake_phrases']=df_train_filtered['text'].apply(lambda x: rake_implement(x,r)).apply(', '.join)

df_train_filtered[['text', 'rake_phrases']][df_train_filtered['rake_phrases'].str.contains("covid", na=False)].head(5)

Unnamed: 0,text,rake_phrases
0,"Mayor Lightfoot announces plans for reopening Chicago By Reel Chicago May 11, 2020 0 Mayor Lori ...","developed around three key areas including healthy interactions, illinois ’ “ restore illinois ”..."
1,New Adult-Use Customers Can Only Shop Online or Over-the-Phone for In-Store Pickup Medical Patie...,"expected ,” “ budget ,” “ scheduled ,” “ estimates ,” “ forecasts ,” “ intends ,” “ anticipates ..."
2,New Adult-Use Customers Can Only Shop Online or Over-the-Phone for In-Store Pickup\nMedical Pati...,"united states private securities litigation reform act, filing statement dated december 5, micha..."
4,"As the coronavirus spreads, the Tribune is tracking Illinois cases here »\nHere’s what’s happeni...","city business affairs commissioner rosa escareno said deferring, illinois municipal league execu..."
5,"Flipboard The latest Dr. Ngozi Ezike, director of the Illinois Department of Public Health, at a...","pay rent must ’: chicagoland apartment association fears ‘ tremendous amount, used safety protoc..."


### Entity (organizations and people) identification

I will use Spacy to finish this task 

In [12]:
import pandas as pd
import spacy 
from spacy import displacy
# Load SpaCy model
#nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_md")
nlp = spacy.load("en_core_web_lg")

In [13]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [14]:
# Checking active pipeline components
nlp.pipe_names

['tagger', 'parser', 'ner']

In [15]:
df_train_filtered

Unnamed: 0,text,label,classes,polarity,subjectivity
0,Mayor Lightfoot announces plans for reopening ...,3,Health,0.053063,0.396297
1,New Adult-Use Customers Can Only Shop Online o...,3,Health,0.095551,0.389792
2,New Adult-Use Customers Can Only Shop Online o...,3,Health,0.095551,0.389792
3,Pope Francis Names Father Louis Tylka of Archd...,4,Weather,0.057143,0.377143
4,"As the coronavirus spreads, the Tribune is tra...",3,Health,0.092845,0.384135
...,...,...,...,...,...
5010,The fight over COVID-19 has become a legal bat...,3,Health,0.099058,0.500924
5011,US & World Illinois Lawmaker Files Lawsuit; Wa...,3,Health,0.116922,0.523469
5012,One at a time or all at once? Downstate Illino...,3,Health,0.043002,0.393983
5013,Saint Anthony Hospital - Chicago Files Federal...,3,Health,0.007586,0.368336


## Positive Entity Analysis

In [28]:
df_train_filtered_Positive

Unnamed: 0,text,label,classes,polarity,subjectivity


In [62]:
# Using df_train_filtered chose poladity >0.8

# For different classes identify what is the top entity in the list 
#(Negative paragrah group by clasess doing NER)
df_train_filtered_Positive=df_train_filtered[df_train_filtered.polarity>0.16]

In [63]:
df_train_filtered_Positive

Unnamed: 0,text,label,classes,polarity,subjectivity
37,Share Description We're fast becoming the nati...,-1,ABSTAIN,0.197281,0.458120
39,Data Research Internship at University of Chic...,-1,ABSTAIN,0.232421,0.472767
45,"Pam Maxey, of Texico, had tears streaming down...",-1,ABSTAIN,0.232506,0.432388
48,Chicago State University’s leadership transfor...,-1,ABSTAIN,0.167260,0.373519
49,2:44 PM Students walk toward their lunchroom ...,-1,ABSTAIN,0.161780,0.342110
...,...,...,...,...,...
4964,Creating IT Futures Launches CompTIA Tech Care...,3,Health,0.176540,0.390616
4973,Creating IT Futures Launches CompTIA Tech Care...,3,Health,0.182094,0.392837
4983,Chicago Doctors See “Truly Remarkable” Success...,3,Health,0.195671,0.472186
4989,Google CEO Tells Employees Return to Office Wo...,-1,ABSTAIN,0.164865,0.421171


In [64]:
df_train_filtered_Positive_Health=df_train_filtered_Positive[df_train_filtered_Positive.classes =='Health']
df_train_filtered_Positive_Health.shape

(179, 5)

In [65]:
df_train_filtered_Positive_Weather=df_train_filtered_Positive[df_train_filtered_Positive.classes =='Weather']
df_train_filtered_Positive_Weather.shape

(109, 5)

In [66]:
df_train_filtered_Positive_Tax=df_train_filtered_Positive[df_train_filtered_Positive.classes =='Tax']
df_train_filtered_Positive_Tax.shape

(40, 5)

In [67]:
df_train_filtered_Positive_Safety=df_train_filtered_Positive[df_train_filtered_Positive.classes =='Safety']
df_train_filtered_Positive_Safety.shape

(10, 5)

In [68]:
df_train_filtered_Positive_Investment=df_train_filtered_Positive[df_train_filtered_Positive.classes =='Investment']
df_train_filtered_Positive_Investment.shape

(18, 5)

In [69]:
#Find the Top 20 Organization for Positive articles in regard to the Health Issues




# Concatenate strings 
text=df_train_filtered_Positive_Health.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'ORG']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
COVID-19,253
Sanders,115
BOP,47
CNN,43
Lightfoot,39
Pritzker,31
CDC,28
GM,25
Vistra,25
the Illinois Department of Public Health,24


In [71]:
#Find the Top 20 Nationalities or religious or political groups for Positive articles in regard to the Health Issues




# Concatenate strings 
text=df_train_filtered_Positive_Health.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'NORP']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Democratic,68
Americans,43
Republican,34
Meridian,27
Illinoisans,20
American,18
Trump,16
Hispanic,14
African Americans,13
Latinx,10


In [72]:
#Find the Top 20 people for Positive articles in regard to the Health Issues




# Concatenate strings 
text=df_train_filtered_Positive_Health.text.str.cat(sep=' ')

doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'PERSON']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Pritzker,191
Biden,151
COVID-19,45
J.B. Pritzker,40
JB Pritzker,31
Ngozi Ezike,29
Lori Lightfoot,26
Joe Biden,24
Donald Trump,24
Bernie Sanders,18


In [73]:
#####################################################################################################################################################################

In [74]:
# Find the Top 20 Organization for Positive articles in regard to the Weather Issues

# Concatenate strings 
text=df_train_filtered_Positive_Weather.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'ORG']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Body Shops,57
Hotels,56
TX\n,32
Home Chef,31
IHDA,25
NPR,25
Alton,18
Columbia,17
HOST,16
University,14


In [75]:
# Find the Top 20 Nationalities or religious or political groups for Positive articles in regard to the Weather Issues

# Concatenate strings 
text=df_train_filtered_Positive_Weather.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'NORP']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Thai,83
Democratic,18
Republican,15
French,14
Polish,13
Turkish,13
Hispanic,12
American,10
Italian,10
Chicagoans,9


In [76]:
# Find the Top 20 people for Positive articles in regard to the Weather Issues

# Concatenate strings 
text=df_train_filtered_Positive_Weather.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'PERSON']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Hail,26
ROBERTO GONZALEZ,18
Farmer,18
Stanley Gehrt,16
Lewis,16
IL,15
Serving Brunch,14
Patch,13
Pritzker,12
Ziegler,10


In [77]:
############################################################################################################################################################

In [78]:
# Find the Top 20 Organization for Positive articles in regard to the Tax Issues

# Concatenate strings 
text=df_train_filtered_Positive_Tax.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'ORG']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
STI,7
Impact Advisors,7
Apple,7
CPF,6
Lightfoot,5
AP,5
Chicago Department of Public Health,5
Chronic Wasting Disease,4
House,4
Chicago Pacific Founders,4


In [79]:
# Find the Top 20 Nationalities or religious or political groups for Positive articles in regard to the Tax Issues

# Concatenate strings 
text=df_train_filtered_Positive_Tax.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'NORP']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Democratic,7
Chicagoans,4
Irish,4
Hispanic,3
Illinoisans,2
African American,2
Asian,2
Spanish,2
Midwestern,2
Irish-American,1


In [80]:
# Find the Top 20 people for Positive articles in regard to the Tax Issues

# Concatenate strings 
text=df_train_filtered_Positive_Tax.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'PERSON']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)




Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Allison Arwady,7
Lori Lightfoot,6
O'Shea,6
Brian Mackey,5
J.B. Pritzker,4
Pritzker,4
Sam Dunklau,3
Biden,3
IL,3
Latino,3


In [None]:
#####################################################################################################################################################################

In [81]:
# Find the Top 20 Organization for Positive articles in regard to the Safety Issues

# Concatenate strings 
text=df_train_filtered_Positive_Safety.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'ORG']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
PediaStaff,6
Valach,3
Champaign,2
Mah,2
BCC,2
Brookings,2
Chicago Booth,2
STEM,2
The Trace,1
Supreme Court,1


In [82]:
# Find the Top 20 Nationalities or religious or political groups for Positive articles in regard to the Safety Issues

# Concatenate strings 
text=df_train_filtered_Positive_Safety.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'NORP']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Chicagoans,1
French,1
Italian,1
Japanese,1
Southeast Asian,1
Southeast Asians,1
Special,1
american,1


In [83]:
# Find the Top 20 people for Positive articles in regard to the Safety Issues

# Concatenate strings 
text=df_train_filtered_Positive_Safety.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'PERSON']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)




Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Martin,5
Booth,3
Katelyn Newman,2
Theresa Valach,2
Lori Lightfoot,2
Madison,2
Joshua,2
Click HERE,2
Josh,2
Ram Villivalam,1


In [None]:
#####################################################################################################################################################################

In [84]:
# Find the Top 20 Organization for Positive articles in regard to the Investment Issues

# Concatenate strings 
text=df_train_filtered_Positive_Investment.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'ORG']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Associated Press,10
Everstream,9
Herrin,7
Postweiler,5
SCD,5
NPR,5
Ryker,4
the Palos Heights Public Library,3
the Iowa Democratic Party,3
Pritzker,3


In [85]:
# Find the Top 20 Nationalities or religious or political groups for Positive articles in regard to the Investment Issues

# Concatenate strings 
text=df_train_filtered_Positive_Investment.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'NORP']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Democrats,9
Republican,7
Democratic,3
American,2
BA,2
Cheese,2
Chicagoans,2
Illinoisans,2
North American,2
CT,1


In [86]:
# Find the Top 20 people for Positive articles in regard to the Investment Issues

# Concatenate strings 
text=df_train_filtered_Positive_Investment.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'PERSON']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)




Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Merritt,8
Mao,6
Clark,4
Pritzker,4
Jamieson,4
Bob Clark,3
McKiernan,3
JB Pritzker,3
Brian McKiernan,3
J.B. Pritzker,3


## Negative Entity Analysis

In [16]:
# Using df_train_filtered chose poladity <0.019

# For different classes identify what is the top entity in the list 
#(Negative paragrah group by clasess doing NER)
df_train_filtered_Negative=df_train_filtered[df_train_filtered.polarity<0.019]

In [17]:
df_train_filtered_Negative

Unnamed: 0,text,label,classes,polarity,subjectivity
10,May 11 (UPI) -- Illinois Gov. J.B. Pritzker an...,3,Health,-0.004274,0.365755
32,CHICAGO -- Pope Francis announced Monday that ...,-1,ABSTAIN,0.000000,0.243750
41,"By Ben Szalinski, Illinois Policy Institute | ...",-1,ABSTAIN,-0.050526,0.572924
46,Home price growth stalled last fall in the Chi...,0,Tax,0.015040,0.347513
55,"Authored by Elizabeth Bauer via Forbes.com,\nI...",-1,ABSTAIN,0.011320,0.508948
...,...,...,...,...,...
4975,"CHICAGO , April 28, Saint Anthony Hospital has...",3,Health,0.003504,0.364095
4976,Saint Anthony Hospital has filed suit against ...,3,Health,0.003504,0.364095
4984,Job Title: English Placement Test (EPT) Coordi...,-1,ABSTAIN,0.012946,0.278571
4991,Judge slaps down Illinois governor's extended ...,3,Health,0.014909,0.461058


In [18]:
df_train_filtered_Negative_Health=df_train_filtered_Negative[df_train_filtered_Negative.classes =='Health']
df_train_filtered_Negative_Health.shape

(245, 5)

In [19]:
df_train_filtered_Negative_Weather=df_train_filtered_Negative[df_train_filtered_Negative.classes =='Weather']
df_train_filtered_Negative_Weather.shape

(45, 5)

In [20]:
df_train_filtered_Negative_Tax=df_train_filtered_Negative[df_train_filtered_Negative.classes =='Tax']
df_train_filtered_Negative_Tax.shape

(46, 5)

In [21]:
df_train_filtered_Negative_Safety=df_train_filtered_Negative[df_train_filtered_Negative.classes =='Safety']
df_train_filtered_Negative_Safety.shape

(34, 5)

In [22]:
df_train_filtered_Negative_Investment=df_train_filtered_Negative[df_train_filtered_Negative.classes =='Investment']
df_train_filtered_Negative_Investment.shape

(6, 5)

### Find the top NER in Each Group

* Find the Top 20 Organization for Negative articles in regard to the Health Issues



In [152]:

# Concatenate strings 
text=df_train_filtered_Negative_Health.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'ORG']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
COVID-19,331
Lightfoot,188
State,83
AP,67
Medicaid,52
WBEZ,48
Raoul,44
CDC,44
Chicago Sun-Times,42
Soldier Field,34


* Find the Top 20 Nationalities or religious or political groups for Negative articles in regard to the Health Issues



In [153]:

# Concatenate strings 
text=df_train_filtered_Negative_Health.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'NORP']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Americans,75
African,35
American,26
African Americans,20
Chicagoans,18
African American,17
Illinoisans,16
Democrats,15
Republican,13
Iranian,12


* Find the Top 20 people for Negative articles in regard to the Health Issues



In [154]:

# Concatenate strings 
text=df_train_filtered_Negative_Health.text.str.cat(sep=' ')

doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'PERSON']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Pritzker,143
Lori Lightfoot,110
J.B. Pritzker,84
COVID-19,75
Saint Anthony,69
Coronavirus,46
Rezin Garcia,40
Jail,30
Ngozi Ezike,29
JB Pritzker,27


In [155]:
########################################################################################################################################################

In [156]:
# Find the Top 20 Organization for Negative articles in regard to the Weather Issues

# Concatenate strings 
text=df_train_filtered_Negative_Weather.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'ORG']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
"Battalion,130th Infantry",18
130th Infantry Regiment,15
The Illinois Army National Guard,14
AP,12
EIS,10
Senate,9
130th Infantry,9
House,9
Control,8
a Chicago Animal Care,8


In [157]:
# Find the Top 20 Nationalities or religious or political groups for Negative articles in regard to the Weather Issues

# Concatenate strings 
text=df_train_filtered_Negative_Weather.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'NORP']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Republican,13
Democrats,8
Democratic,8
Midwestern,8
Jews,7
American,7
Mississippian,5
European,4
Americans,4
Mississippians,3


In [158]:
# Find the Top 20 people for Negative articles in regard to the Weather Issues

# Concatenate strings 
text=df_train_filtered_Negative_Weather.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'PERSON']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Laurie Skrivan,18
Rod Blagojevich,6
Caleb Parson,6
COGFA,6
Pritzker,6
Chris Sweda,5
Obama,4
Macey Parson,4
Parson,4
Simpson,4


In [159]:
#################################################################################################################################################

In [160]:
# Find the Top 20 Organization for Negative articles in regard to the Tax Issues

# Concatenate strings 
text=df_train_filtered_Negative_Tax.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'ORG']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
TW,10
AGI,8
Single-Family Homes,4
Congress,4
the U.S. Census Bureau,3
U.S. Census,3
NY,3
the Illinois Senate,3
CBS 2,3
Labor Force Participation Rate for,3


In [161]:
# Find the Top 20 Nationalities or religious or political groups for Negative articles in regard to the Tax Issues

# Concatenate strings 
text=df_train_filtered_Negative_Tax.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'NORP']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Annual,2
Interim,1
Poles,1
Polish,1
Trump,1


In [162]:
# Find the Top 20 people for Negative articles in regard to the Tax Issues

# Concatenate strings 
text=df_train_filtered_Negative_Tax.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'PERSON']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)



Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
IL,6
Greg Bishop,4
Ronnie Howard,4
John Klingner,3
Austin,3
Ted Dabrowski,3
J.B. Pritzker,2
Miller,2
CA,2
Tim Gleason,2


In [163]:
####################################################################################################################################################################

In [164]:
# Find the Top 20 Organization for Negative articles in regard to the Safety Issues

# Concatenate strings 
text=df_train_filtered_Negative_Safety.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'ORG']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
CTA,17
Evanston,15
CPD,12
Pronouns,11
Parris,10
FBI,8
Lightfoot,7
Waterloo,6
the Democratic Party,6
IFA,5


In [165]:
# Find the Top 20 Nationalities or religious or political groups for Negative articles in regard to the Safety Issues

# Concatenate strings 
text=df_train_filtered_Negative_Safety.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'NORP']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Americans,10
Democrats,10
American,8
Democratic,8
Islamic,7
Democrat,6
Tyrese,6
African American,6
Chicagoans,5
African,4


In [166]:
# Find the Top 20 people for Negative articles in regard to the Safety Issues

# Concatenate strings 
text=df_train_filtered_Negative_Safety.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'PERSON']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)




Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Van Dyke,9
Ramada,7
Lori Lightfoot,6
Morris,6
Jim Crow,6
Pritzker,6
Wendt,6
Jason Van Dyke,5
Simmons,4
Hizb,4


In [167]:
################################################################################################################################################

In [168]:
# Find the Top 20 Organization for Negative articles in regard to the Investment Issues

# Concatenate strings 
text=df_train_filtered_Negative_Investment.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'ORG']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Navy,14
NYPD,14
FBI,10
Minter,9
Print,8
Tribune,7
the New York Times,6
NAACP,5
Port Chicago,4
Heather Mac,4


In [169]:
# Find the Top 20 Nationalities or religious or political groups for Negative articles in regard to the Investment Issues

# Concatenate strings 
text=df_train_filtered_Negative_Investment.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'NORP']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
American,16
African American,12
Americans,9
Jews,6
Jewish,6
African,4
Hispanics,4
Black Israelite,3
Catholic,3
Covington Catholic,2


In [170]:
# Find the Top 20 people for Negative articles in regard to the Investment Issues

# Concatenate strings 
text=df_train_filtered_Negative_Investment.text.str.cat(sep=' ')


doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels})

df=df[df.Labels == 'PERSON']
df['Entities']=df['Entities'].astype(str)

counts_df = df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)



Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Print,9
Smollett,7
Williams,7
Brown,6
Ilhan Omar,5
Anderson,5
Graham Rayman,4
Madison Harris,4
D'Adrien Anderson,4
de Blasio,4
