In [17]:
import re
import string

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import spacy
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob

STOPWORDS = set(stopwords.words('english'))

In [60]:
# Read the comments export using the ISO-8859-1 codec.
df = pd.read_csv('comment_backToWork.csv', encoding="ISO-8859-1")

# Keep rows whose `pick` is present, trim whitespace from the column names,
# then keep only the first occurrence of each comment text.
df = (
    df.loc[df['pick'].notna()]
    .rename(columns=lambda name: name.strip())
    .drop_duplicates(subset='comment', keep="first")
)

In [75]:
# Preview the first entries of the `comment` column.
df.comment.head()

0    Doesn't the targeted approach require massive ...
1    I?m at a loss. I made it through 9/11, Super S...
2    I'm 76 years old. How incredibly selfish it wo...
3    I have zero faith that anyone even remotely at...
4    The Katz/ vertical plan has a serious flaw: Ma...
Name: comment, dtype: object

In [20]:
# Full state / district / territory name -> two-letter abbreviation.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}
# Translate `state` through the mapping. Values that are not keys of the
# mapping (already-abbreviated entries, free text) come back as NaN from
# .map() and are filled with the original `state` value; rows whose `state`
# is itself missing stay missing.
df['state_abbrev'] = df['state'].map(us_state_abbrev).fillna(df['state'])

In [21]:
# Preview the first rows, now including the derived `state_abbrev` column.
df.head()

Unnamed: 0,comment,pick,state,state_abbrev
0,Doesn't the targeted approach require massive ...,NYT Picks,Utah,UT
1,"I?m at a loss. I made it through 9/11, Super S...",NYT Picks,NY,NY
2,I'm 76 years old. How incredibly selfish it wo...,NYT Picks,NC,NC
3,I have zero faith that anyone even remotely at...,NYT Picks,SC,SC
4,The Katz/ vertical plan has a serious flaw: Ma...,NYT Picks,Utah,UT


In [22]:
# Count missing values per column.
df.isnull().sum()

comment          0
pick             0
state           57
state_abbrev    57
dtype: int64

In [23]:
# Number of comments per distinct `state_abbrev` value, most frequent first.
df.state_abbrev.value_counts()

CA             176
NY             135
MA              57
PA              36
NJ              32
              ... 
Finland          1
NB               1
Los Angeles      1
MT               1
IW               1
Name: state_abbrev, Length: 93, dtype: int64

In [24]:
# (rows, columns) of the frame.
df.shape

(1025, 4)

In [25]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0,comment,pick,state,state_abbrev
0,Doesn't the targeted approach require massive ...,NYT Picks,Utah,UT
1,"I?m at a loss. I made it through 9/11, Super S...",NYT Picks,NY,NY
2,I'm 76 years old. How incredibly selfish it wo...,NYT Picks,NC,NC
3,I have zero faith that anyone even remotely at...,NYT Picks,SC,SC
4,The Katz/ vertical plan has a serious flaw: Ma...,NYT Picks,Utah,UT


In [26]:
# Non-null comment count per `state_abbrev`, largest first, top 15 only.
state_group = (
    df.groupby('state_abbrev')['comment']
    .count()
    .reset_index()
    .sort_values('comment', ascending=False)
    .head(15)
)

fig = px.bar(state_group, x='state_abbrev', y='comment')
# Title typo fixed: "Numer" -> "Number".
fig.update_layout(title_text='Number of comments by state top 15', template="plotly_white")
fig.show()

In [27]:
# Number of comments per `pick` category.
df.pick.value_counts()

Reader Picks    1009
NYT Picks         16
Name: pick, dtype: int64

In [28]:
def comment_len(x):
    """Return the number of whitespace-separated words in `x`.

    Parameters
    ----------
    x : object
        Usually a comment string. Any str subclass (for example
        ``numpy.str_``) is accepted.

    Returns
    -------
    int
        Word count for strings; 0 for anything that is not a string
        (None, NaN, numbers, ...).
    """
    # isinstance (rather than `type(x) is str`) so that str subclasses
    # are counted instead of silently mapped to 0.
    if isinstance(x, str):
        return len(x.split())
    return 0

In [29]:
# Word count per comment, then keep only the strictly positive counts.
df['comment_len'] = df['comment'].map(comment_len)
nums_comment = df.loc[df['comment_len'] > 0, 'comment_len']

# Histogram + density curve of the word counts.
fig = ff.create_distplot(hist_data=[nums_comment], group_labels=['Comment']).update_layout(
    title_text='Distribution of word count in comment',
    template="plotly_white",
)
fig.show()

In [30]:
# Positive word counts for each of the five states, built in one pass
# instead of five separate queries.
top_states = ['CA', 'NY', 'MA', 'PA', 'NJ']
has_words = df['comment_len'] > 0
CA_comment, NY_comment, MA_comment, PA_comment, NJ_comment = (
    df.loc[has_words & (df['state_abbrev'] == state), 'comment_len']
    for state in top_states
)

# Density curves only (no histogram bars), one per state.
fig = ff.create_distplot(
    hist_data=[CA_comment, NY_comment, MA_comment, PA_comment, NJ_comment],
    group_labels=top_states,
    colors=px.colors.qualitative.Plotly[5:],
    show_hist=False,
)

fig.update_layout(
    title_text="Comment word count vs. State top 5",
    xaxis_title='word count',
    template="plotly_white",
)
fig.show()

In [31]:
def get_top_n_words(corpus, n=None, ngram_range=(1, 1)):
    """Return the most frequent terms in `corpus` as (term, count) pairs.

    English stop words are removed by CountVectorizer before counting.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count terms in.
    n : int or None, optional
        How many of the top entries to return; None returns every term.
    ngram_range : tuple of (int, int), optional
        (min_n, max_n) n-gram sizes handed to CountVectorizer. The default
        (1, 1) counts single words, which is the original behaviour;
        (2, 2) counts bigrams and (3, 3) trigrams.

    Returns
    -------
    list of (str, count)
        Pairs sorted by descending count; ties keep the vectorizer's
        vocabulary order because the sort is stable.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    # Learn the vocabulary and build the document-term matrix in one pass
    # instead of tokenizing the corpus twice (fit, then transform).
    bag_of_words = vec.fit_transform(corpus)
    # Column sums = total occurrences of each term over all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# Top 20 single words: print each "term count" pair, then tabulate for plotting.
common_words = get_top_n_words(df['comment'], 20)
for entry in common_words:
    print(*entry)
df1 = pd.DataFrame(data=common_words, columns=['word', 'count'])

people 728
virus 357
economy 338
need 322
work 302
health 244
don 239
testing 228
just 222
time 218
friedman 208
like 198
weeks 182
risk 178
medical 171
know 167
care 164
approach 158
test 156
going 155


In [32]:
# Bar chart of the 20 most frequent words.
fig = px.bar(df1, x='word', y='count').update_layout(
    title_text='Comment word count top 20',
    template="plotly_white",
)
fig.show()

In [33]:
def get_top_n_bigram(corpus, n=None):
    """Return the `n` most frequent two-word phrases in `corpus`.

    English stop words are dropped by CountVectorizer before the bigrams
    are formed. The result is a list of (bigram, count) pairs in descending
    count order; `n=None` returns every bigram.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    vectorizer.fit(corpus)
    # Column sums of the document-term matrix = total count per bigram.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Top 20 bigrams: print each "phrase count" pair, then tabulate for plotting.
common_words = get_top_n_bigram(df['comment'], 20)
for entry in common_words:
    print(*entry)
df2 = pd.DataFrame(data=common_words, columns=['word', 'count'])

covid 19 99
mr friedman 74
health care 71
dr katz 58
herd immunity 54
public health 52
stay home 51
don know 45
south korea 38
high risk 35
fatality rate 35
long term 32
social distancing 32
people work 29
people die 29
young people 26
mortality rate 26
million people 25
flatten curve 23
death rate 23


In [34]:
# Bar chart of the 20 most frequent bigrams.
fig = px.bar(df2, x='word', y='count').update_layout(
    title_text='Comment bigram count top 20',
    template="plotly_white",
)
fig.show()

In [35]:
def get_top_n_trigram(corpus, n=None):
    """Return the `n` most frequent three-word phrases in `corpus`.

    English stop words are dropped by CountVectorizer before the trigrams
    are formed. The result is a list of (trigram, count) pairs in descending
    count order; `n=None` returns every trigram.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    vectorizer.fit(corpus)
    # Column sums of the document-term matrix = total count per trigram.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Top 20 trigrams: print each "phrase count" pair, then tabulate for plotting.
common_words = get_top_n_trigram(df['comment'], 20)
for entry in common_words:
    print(*entry)
df3 = pd.DataFrame(data=common_words, columns=['word', 'count'])

test test test 8
high risk people 7
paid sick leave 7
pre existing conditions 7
health care professionals 6
thank mr friedman 6
dr katz approach 6
return work school 5
health care workers 5
emphysema dies lack 5
low fatality rate 5
high risk groups 5
public health officials 5
high risk group 5
dr katz makes 5
forced stay home 4
patient chest pain 4
patient advanced emphysema 4
advanced emphysema dies 4
dies lack facility 4


In [36]:
# Bar chart of the 20 most frequent trigrams.
fig = px.bar(df3, x='word', y='count').update_layout(
    title_text='Comment trigram count top 20',
    template="plotly_white",
)
fig.show()

In [37]:
def print_comment(index, data=None):
    """Print the comment text and state abbreviation stored under `index`.

    Parameters
    ----------
    index : hashable
        Index label of the row to show. This is a label, not a position:
        labels removed by earlier filtering or de-duplication do not exist.
    data : pandas.DataFrame, optional
        Frame with `comment` and `state_abbrev` columns. Defaults to the
        notebook-level `df`.

    Returns
    -------
    None
        Prints two lines when the label exists; prints nothing otherwise.
    """
    frame = df if data is None else data
    rows = frame.loc[frame.index == index, ['comment', 'state_abbrev']].values
    # Check for a match *before* taking the first row; the previous version
    # indexed first and raised IndexError for a missing label.
    if len(rows) == 0:
        return
    text, state = rows[0]
    print(text)
    print('state_abbrev:', state)

In [38]:
# Show the comment stored under index label 0 with its state abbreviation.
print_comment(0)

Doesn't the targeted approach require massive testing? Yes, it does. And we're not there yet. Until we are this advice seems premature.
state_abbrev: UT


In [39]:
# Show the comment stored under index label 100 with its state abbreviation.
print_comment(100)

Public health professor here. With all due respect to the experts mentioned in the article, the best simulation studies suggest that the mortality rate is higher than 1%. But even those high quality simulations are difficult to interpret because we don't have adequate data on incidence or prevalence. And we don't have adequate data because we don't have adequate testing equipment. And we don't have adequate testing equipment because our health care "system" is in tatters, in part because various industries (pharmaceutical, medical device, insurance) have bribed our lawmakers and handcuffed our clinicians (physicians, nurses, etc). As a result of third party intrusions, our clinicians can no longer deliver high quality care and have unprecedented rates of burn out and moral distress. If this unmitigated COVID 19 disaster is not sufficient to move this country toward universal health care, nothing will (at least in my lifetime). Tom, if you want to use your pulpit to jumpstart the econom

In [40]:
# Show the comment stored under index label 1000 with its state abbreviation.
print_comment(1000)

First time ever I agree with Ton Friedman. But, even in these deadly serious times, he could not resist delivering his message without taking a poke at the President. And for what? For following the advice of the academia know it all know nothings - Tom's favorite people.
state_abbrev: USA


In [41]:
# TextBlob sentiment polarity of every comment, stored as a new column.
df['comment_polarity'] = [
    TextBlob(comment).sentiment.polarity for comment in df['comment']
]
# Summary statistics of the polarity scores.
df['comment_polarity'].describe()

count    1025.000000
mean        0.076865
std         0.153867
min        -0.800000
25%         0.000000
50%         0.073333
75%         0.151389
max         1.000000
Name: comment_polarity, dtype: float64

In [42]:
# The old `comment_polarity != 1000` filter was dropped: no code in this
# notebook ever writes 1000 into the column, so the filter kept every row.
nums_polarity = df['comment_polarity']

# Histogram + density curve of the polarity scores.
fig = ff.create_distplot(hist_data=[nums_polarity], group_labels=['Comment polarity'])
fig.update_layout(title_text='Distribution of sentiment polarity in comment', template="plotly_white")
fig.show()

In [43]:
# Plain histogram of the polarity scores.
fig = px.histogram(df, x="comment_polarity").update_layout(
    title_text='Distribution of sentiment polarity in comment',
    template="plotly_white",
)
fig.show()

In [45]:
# Polarity scores for each of the five states, built in one pass instead of
# five copy-pasted queries. The old `comment_polarity != 1000` condition was
# dropped: no code in this notebook writes 1000 into the column, so it never
# excluded a row.
top_states = ['CA', 'NY', 'MA', 'PA', 'NJ']
CA_polarity, NY_polarity, MA_polarity, PA_polarity, NJ_polarity = (
    df.loc[df['state_abbrev'] == state, 'comment_polarity']
    for state in top_states
)

# Density curves only (no histogram bars), one per state.
fig = ff.create_distplot(hist_data=[CA_polarity, NY_polarity, MA_polarity, PA_polarity, NJ_polarity],
                         group_labels=top_states,
                         colors=px.colors.qualitative.Plotly[5:], show_hist=False)

fig.update_layout(title_text="Comment sentiment polarity vs. State top 5", xaxis_title='sentiment polarity', template="plotly_white")
fig.show()

In [46]:
# Mean sentiment polarity for CA, NY, MA, PA and NJ.
top_states = ['CA', 'NY', 'MA', 'PA', 'NJ']
df4 = df.loc[df['state_abbrev'].isin(top_states)]
polarity_group = df4.groupby('state_abbrev')['comment_polarity'].mean().reset_index()

# Highlight the bar(s) holding the lowest mean. The previous version painted
# position 2 crimson unconditionally, which is only right while the third
# state in the grouped order happens to be the minimum.
lowest_polarity = polarity_group['comment_polarity'].min()
colors = [
    'crimson' if value == lowest_polarity else 'lightslategray'
    for value in polarity_group['comment_polarity']
]
fig = go.Figure(data=[go.Bar(
    x=polarity_group['state_abbrev'],
    y=polarity_group['comment_polarity'],
    marker_color=colors
)])
# Title typo fixed: "sentment" -> "sentiment". Left as the cell's last
# expression so the returned figure is displayed.
fig.update_layout(title_text='Lowest average sentiment polarity')

In [47]:
def clean_text(text):
    '''Normalize one comment for downstream tokenization.

    Steps, in order: lowercase; remove text in square brackets; remove
    punctuation; remove words containing digits; drop English stop words.

    Parameters
    ----------
    text : object
        The raw comment. Non-string values (None, NaN, ...) are treated as
        empty text.

    Returns
    -------
    str
        The remaining words joined by single spaces. An empty string is
        returned for non-string input and when two characters or fewer
        remain after the removals. Earlier this case fell through and
        returned None, which downstream string processing cannot handle.
    '''
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # The threshold is on characters, not words: anything of length <= 2
    # after the removals is discarded.
    if len(text) <= 2:
        return ''
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

# One-column frame (`comment`) holding the cleaned text of every comment.
df_clean = df['comment'].apply(clean_text).to_frame()

In [48]:
# Load spaCy's 'en' pipeline with the 'ner' and 'parser' components disabled.
# NOTE(review): the 'en' shortcut name depends on the installed spaCy version
# and model link — confirm it resolves in the target environment.
nlp = spacy.load('en', disable=['ner', 'parser']) 

def lemmatizer(text):
    """Return `text` with every token replaced by its spaCy lemma.

    The text is run through the notebook-level `nlp` pipeline and the
    lemmas are joined back together with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

# Lemmatize each cleaned comment, then strip the '-PRON-' placeholder
# from the lemmatized text.
df_clean['comment_lemmatize'] = (
    df_clean['comment']
    .map(lemmatizer)
    .str.replace('-PRON-', '')
)

In [49]:
# One token list per comment, obtained by whitespace-splitting the lemmatized text.
sentences = [row.split() for row in df_clean['comment_lemmatize']]

# NOTE(review): `size` is the gensim 3.x keyword for the vector dimension;
# confirm the installed gensim version before upgrading.
w2v_model = Word2Vec(min_count=20,
                     window=5,
                     size=100,
                     workers=4)

w2v_model.build_vocab(sentences)
# Use `epochs`; the `iter` attribute is deprecated and, per the warning this
# cell used to emit, will be removed in gensim 4.0.0.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)


Call to deprecated `iter` (Attribute will be removed in 4.0.0, use self.epochs instead).



(145334, 268345)

In [50]:
# Precompute normalized vectors for similarity queries.
# NOTE(review): with replace=True gensim is documented to overwrite the raw
# vectors, so the model should not be trained further afterwards — confirm.
w2v_model.init_sims(replace=True)

In [51]:
# Vocabulary words whose vectors are most similar to 'people'.
w2v_model.wv.most_similar(positive=['people'])

[('other', 0.999828577041626),
 ('require', 0.9998206496238708),
 ('small', 0.9998093843460083),
 ('family', 0.9998053908348083),
 ('group', 0.9998005628585815),
 ('live', 0.9997981786727905),
 ('may', 0.9997856616973877),
 ('every', 0.9997822642326355),
 ('many', 0.999780535697937),
 ('need', 0.9997801184654236)]

In [52]:
# Vocabulary words whose vectors are most similar to 'virus'.
w2v_model.wv.most_similar(positive=['virus'])

[('may', 0.9998370409011841),
 ('include', 0.9998369812965393),
 ('use', 0.9998334050178528),
 ('impact', 0.999821126461029),
 ('need', 0.9998197555541992),
 ('many', 0.9998196363449097),
 ('way', 0.999817430973053),
 ('likely', 0.9998144507408142),
 ('still', 0.9998128414154053),
 ('one', 0.9998122453689575)]

In [53]:
# Vocabulary words whose vectors are most similar to 'economy'.
w2v_model.wv.most_similar(positive=['economy'])

[('life', 0.999870777130127),
 ('society', 0.9998369216918945),
 ('one', 0.9998353719711304),
 ('month', 0.999813973903656),
 ('try', 0.9998136162757874),
 ('make', 0.999812126159668),
 ('face', 0.9998112320899963),
 ('continue', 0.9998108148574829),
 ('everything', 0.9998078346252441),
 ('another', 0.9998077750205994)]

In [54]:
# Similarity score between the vectors of 'health' and 'economy'.
w2v_model.wv.similarity('health', 'economy')

0.99956435

In [55]:
# Table of word vectors, indexed by vocabulary word. Vectors are looked up
# through `.wv`; indexing the model object directly is deprecated and, per
# the warning this cell used to emit, will be removed in gensim 4.0.0.
vocab_words = list(w2v_model.wv.vocab)
model_wv_df = pd.DataFrame(w2v_model.wv[vocab_words], index=vocab_words)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



In [56]:
# Words of interest for the 2-D projection.
keywords = ["economy", "health", "people", "virus",\
            "need", "work", "testing", "friedman",\
            "risk", "medical", "care", 'infect', 'president', 'approach', 'week', 'know']
# Keep only the keywords the model actually learned. Membership is tested
# against the vocabulary mapping directly instead of rebuilding a list of
# the whole vocabulary for every keyword.
words = [word for word in keywords if word in w2v_model.wv.vocab]

In [57]:
# Vectors of the selected words (rows of model_wv_df are labelled by word).
X = model_wv_df.loc[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# Stored under its own name: the previous version rebound `df`, replacing the
# comments frame and breaking any earlier cell that is re-run afterwards.
pca_df = pd.DataFrame(result, columns=["Component 1", "Component 2"])
# Label rows with `words` (the keywords found in the vocabulary), which is
# what X was built from. Using `keywords` mismatches the row count whenever
# a keyword is missing from the vocabulary.
pca_df["Word"] = words
# Euclidean distance of each projected point from the origin.
pca_df["Distance"] = np.sqrt(pca_df["Component 1"]**2 + pca_df["Component 2"]**2)
fig = px.scatter(pca_df, x="Component 1", y="Component 2", text="Word", color="Distance", color_continuous_scale="agsunset",size="Distance")
fig.update_traces(textposition='top center')
fig.layout.xaxis.autorange = True
fig.data[0].marker.line.width = 1
fig.data[0].marker.line.color = 'rgb(0, 0, 0)'
fig.update_layout(height=800, title_text="2D PCA of Word2Vec embeddings", template="plotly_white", paper_bgcolor="#f0f0f0")
fig.show()