<a href="https://colab.research.google.com/github/ykim71/thesis_related/blob/main/rule_based_matching_pos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# load files/set googledrive

In [1]:
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [2]:
%cd drive/'MyDrive'/CrowdTangle/final_data_metrics

/content/drive/MyDrive/CrowdTangle/final_data_metrics


In [3]:
!pip install --upgrade pandas==1.3.4

Collecting pandas==1.3.4
  Downloading pandas-1.3.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 7.9 MB/s 
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.5
    Uninstalling pandas-1.3.5:
      Successfully uninstalled pandas-1.3.5
Successfully installed pandas-1.3.4


# POS tagging

In [3]:
import pickle
import pandas as pd

d_pages_text_only = pd.read_pickle("d_pages_text_only.pkl")

In [None]:
#test = d_pages_text_only.sample(100)

In [9]:
import spacy
from spacy import displacy

from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

## customize hypenated words

def custom_tokenizer(nlp):
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
    )

    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                                suffix_search=nlp.tokenizer.suffix_search,
                                infix_finditer=infix_re.finditer,
                                token_match=nlp.tokenizer.token_match,
                                rules=nlp.Defaults.tokenizer_exceptions)



sp = spacy.load("en_core_web_sm", disable=["tok2vec", "ner", "lemmatizer"])
sp.tokenizer = custom_tokenizer(sp)

# sample = """ 
# Sore loser libs trolled with #NotMySuperBowlChamps, blame victory on white privilege. Déjà vu?  "The #Patriots only won because of white privilege - none of you gonna sell your ring to feed hungry black people.
# """
# doc = sp(sample)
# #print([t.text for t in doc])

# for token in doc:
#     print(token.text, token.pos_, token.tag_, token.dep_)

In [6]:

import numpy as np
dfs = np.array_split(d_pages_text_only, 30)


In [10]:
# for unigrams

for num in range(0,30):
  
  sample = dfs[num]['text'].values.tolist()
  
  unigrams_text = []
  unigrams_tag = []

  for sen in sample:
    doc = sp(sen)

    for i in range(len(doc)):
      if ( (doc[i].pos_ == "ADJ" or doc[i].pos_ == "NOUN" or doc[i].pos_ == "PROPN")  ):
          unigrams_text.append( str( doc[i].text ) )
          unigrams_tag.append( str( doc[i].pos_ ) )
          
  temp = pd.DataFrame({'text': unigrams_text, 'pos_tag': unigrams_tag })
  
  temp2  = temp.groupby(['text']).size().reset_index(name='counts')
  temp2.to_csv("./dem_pos_unigrams/temp_dem_d"+str(num)+".csv")


KeyboardInterrupt: ignored

In [None]:
# for bigrams
import csv
import pandas as pd


for num in range(0,30):
  
  sample = dfs[num]['text'].values.tolist()
  
  bigram_text = []
  bigram_tag = []

  for sen in sample:
    
    doc = sp(sen)
    for i in range(len(doc)):
      j = i+1
      if j < len(doc):
        if ( (doc[i].pos_ == "ADJ" or doc[i].pos_ == "NOUN" or doc[i].pos_ == "PROPN") and (doc[j].pos_ == "ADJ" or doc[j].pos_ == "NOUN" or doc[j].pos_ == "PROPN") ):
          bigram_text.append( str( doc[i].text + " " + doc[j].text ) )
          bigram_tag.append( str( doc[i].pos_ + " " + doc[j].pos_ ) )
          
  temp = pd.DataFrame({'text': bigram_text, 'pos_tag': bigram_tag })
  
  temp2  = temp.groupby(['text']).size().reset_index(name='counts')
  temp2.to_csv("./dem_pos_bigrams/temp_dem_d"+str(num)+".csv")




# merge all csvs

In [None]:


import glob
import os
import pandas as pd   

path = r'./dem_pos_bigrams/'                     # use your path
all_files = glob.glob(os.path.join(path, "*.csv"))     # advisable to use os.path.join as this makes concatenation OS independent

df_from_each_file = (pd.read_csv(f, header=0) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)




In [None]:
concatenated_df.sample(3)

Unnamed: 0.1,Unnamed: 0,text,counts
4499361,453072,feeding operation,2
13578461,665575,© Greg,5
8303890,325752,VOTING LONG,1


In [None]:
df_merge = concatenated_df.groupby(['text'])['counts'].agg('sum').reset_index()

In [None]:
df_merge.sample(3)

Unnamed: 0,text,counts
35389,# GovernmentContractor,1
5060267,dehumanizing dragnet,1
785348,COLLECTIVE IDIOCY,2


In [None]:
df_merge.to_csv("dem_pos_pos_bigrams_all.csv")

In [None]:
len(df_merge)

8150011

In [None]:
df_merge_n10 = df_merge.loc[df_merge['counts'] >= 10]
df_merge_n100 = df_merge.loc[df_merge['counts'] >= 100]
df_merge_n1k = df_merge.loc[df_merge['counts'] >= 1000]


In [None]:
len(df_merge_n100)

63405

In [None]:
len(df_merge_n1k)

5721

In [None]:
df_merge_n1k.to_csv("dem_pos_bigrams_all_n1k.csv")

In [None]:
df_merge_n100.to_csv("dem_pos_bigrams_all_n100.csv")

In [None]:


import glob
import os
import pandas as pd   

path = r'./rep_pos_bigrams/'                     # use your path
all_files = glob.glob(os.path.join(path, "*.csv"))     # advisable to use os.path.join as this makes concatenation OS independent

df_from_each_file = (pd.read_csv(f, header=0) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)




In [None]:
df_merge = concatenated_df.groupby(['text'])['counts'].agg('sum').reset_index()

In [None]:
len(df_merge)

6835851

In [None]:
df_merge.to_csv("rep_pos_pos_bigrams_all.csv")

In [None]:
df_merge.sample(2)

Unnamed: 0,text,counts
449692,BUCKET LIST,6
6824128,😳 Flashback,4


In [None]:
import pandas as pd

dem_pos_pos_bigrams_all = pd.read_csv("dem_pos_pos_bigrams_all.csv")

In [None]:
rep_pos_pos_bigrams_all = pd.read_csv("rep_pos_pos_bigrams_all.csv")

In [None]:
dem_pos_pos_bigrams_all.rename(columns={'text':'text', 'counts': 'count_dem'}, inplace=True)
rep_pos_pos_bigrams_all.rename(columns={'text':'text', 'counts': 'count_rep'}, inplace=True)


In [None]:
dem_pos_pos_bigrams_all = dem_pos_pos_bigrams_all[['text','count_dem']]
rep_pos_pos_bigrams_all = rep_pos_pos_bigrams_all[['text','count_rep']]

In [None]:
dem_pos_pos_bigrams_all['count_dem'] = dem_pos_pos_bigrams_all['count_dem'].astype(int)
rep_pos_pos_bigrams_all['count_rep'] = rep_pos_pos_bigrams_all['count_rep'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
df3_merged = pd.merge(dem_pos_pos_bigrams_all, rep_pos_pos_bigrams_all, how="outer", on='text')


In [None]:
import numpy as np

df3_merged = df3_merged.replace(np.nan, 0)

In [None]:
df3_merged['count_sum'] = df3_merged['count_dem'] + df3_merged['count_rep']

In [None]:
len(df3_merged)

13106677

In [None]:
df3_merged_select = df3_merged[df3_merged['count_dem'] >= 100]

In [None]:
df3_merged_select.sample(4)

Unnamed: 0,text,count_dem,count_rep,count_sum
6338249,litmus test,773.0,506.0,1279.0
1529356,Family Planning,510.0,134.0,644.0
3899505,Teresa Kumar,140.0,5.0,145.0
1709329,General Meeting,118.0,36.0,154.0


In [None]:
len(df3_merged_select)

63405

In [None]:
df3_merged_select.to_csv("pos_bigrams_all.csv")

## pre-determined NER

In [None]:
ner = pd.read_excel("/Users/yujinkim/Downloads/CrowdTangle/language_analysis/NER/ner/ner_count_m100_all_review_wip_update.xlsx")


# test - examples

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
sample = """
Daily Kos The AP reports that at least 65,000 bridges are "structurally deficient" and over 20,000 more are "fracture critical," meaning that the failure of one component could lead to a collapse, as we saw in the I-5 bridge over the Skagit River in Washington state last spring. Join Daily Kos and the Democratic Senatorial Campaign Committee (DSCC) to tell Senate Republicans: Fix our roads and bridges instead of shutting down our government. AS THE GOP PREVENTS US FROM REBUILDING BADLY DECAYING INFRASTRUCTURE BECAUSE IT WILL CREATE JOBS AND IMPROVE THE ECONOMY, WHICH THEY DO NOT WANT, THEY HOPE THAT EVERY BRIDGE COLLAPSE WILL BE BLAMED ON OBAMA. YEAH.... THANKS OBAMA! HOW STUPID DO THEY THINK WE ARE? Tell Senate Republicans: Fix our roads and bridges instead of shutting down our government  Republicans are playing games with our government to appeal to their right-wing base—pushing us to the brink of a government shutdown in order to score a few points with the Tea Party. While they’re playing games of legislative chicken, our country is facing a major infrastructure crisis. The AP rep... 
"""
doc = nlp(sample)
for token in doc:
    print(token.text, token.pos_, token.tag_, token.dep_)


 SPACE _SP 
Daily PROPN NNP compound
Kos PROPN NNP ROOT
The DET DT det
AP PROPN NNP nsubj
reports VERB VBZ ROOT
that SCONJ IN mark
at ADV RB advmod
least ADV RBS advmod
65,000 NUM CD nummod
bridges NOUN NNS nsubj
are AUX VBP ccomp
" PUNCT `` punct
structurally ADV RB advmod
deficient ADJ JJ acomp
" PUNCT '' punct
and CCONJ CC cc
over ADP IN quantmod
20,000 NUM CD nsubj
more ADJ JJR amod
are AUX VBP conj
" PUNCT `` punct
fracture NOUN NN amod
critical ADJ JJ acomp
, PUNCT , punct
" PUNCT '' punct
meaning VERB VBG advcl
that SCONJ IN mark
the DET DT det
failure NOUN NN nsubj
of ADP IN prep
one NUM CD nummod
component NOUN NN pobj
could VERB MD aux
lead VERB VB ccomp
to ADP IN prep
a DET DT det
collapse NOUN NN pobj
, PUNCT , punct
as SCONJ IN mark
we PRON PRP nsubj
saw VERB VBD advcl
in ADP IN prep
the DET DT det
I-5 PROPN NNP punct
bridge NOUN NN pobj
over ADP IN prep
the DET DT det
Skagit PROPN NNP compound
River PROPN NNP pobj
in ADP IN prep
Washington PROPN NNP compound
state NOUN N

In [None]:
import spacy
sample = """
Daily Kos The AP reports that at least 65,000 bridges are "structurally deficient" and over 20,000 more are "fracture critical," meaning that the failure of one component could lead to a collapse, as we saw in the I-5 bridge over the Skagit River in Washington state last spring. Join Daily Kos and the Democratic Senatorial Campaign Committee (DSCC) to tell Senate Republicans: Fix our roads and bridges instead of shutting down our government. AS THE GOP PREVENTS US FROM REBUILDING BADLY DECAYING INFRASTRUCTURE BECAUSE IT WILL CREATE JOBS AND IMPROVE THE ECONOMY, WHICH THEY DO NOT WANT, THEY HOPE THAT EVERY BRIDGE COLLAPSE WILL BE BLAMED ON OBAMA. YEAH.... THANKS OBAMA! HOW STUPID DO THEY THINK WE ARE? Tell Senate Republicans: Fix our roads and bridges instead of shutting down our government  Republicans are playing games with our government to appeal to their right-wing base—pushing us to the brink of a government shutdown in order to score a few points with the Tea Party. While they’re playing games of legislative chicken, our country is facing a major infrastructure crisis. The AP rep... 
"""

nlp = spacy.load('en_core_web_sm')
doc = nlp(sample)
for i in range(len(doc)):
    j = i+1
    
    if j < len(doc):
      if ( (doc[i].pos_ == "ADJ" or doc[i].pos_ == "NOUN" or doc[i].pos_ == "PROPN") and (doc[j].pos_ == "ADJ" or doc[j].pos_ == "NOUN" or doc[j].pos_ == "PROPN") ):
        print(doc[i].text, doc[j].text, doc[i].pos_, doc[j].pos_)

Daily Kos PROPN PROPN
fracture critical NOUN ADJ
I-5 bridge PROPN NOUN
Skagit River PROPN PROPN
Washington state PROPN NOUN
state last NOUN ADJ
last spring ADJ NOUN
Daily Kos PROPN PROPN
Democratic Senatorial PROPN PROPN
Senatorial Campaign PROPN PROPN
Campaign Committee PROPN PROPN
Senate Republicans PROPN PROPN
REBUILDING BADLY NOUN NOUN
BADLY DECAYING NOUN NOUN
DECAYING INFRASTRUCTURE NOUN NOUN
BRIDGE COLLAPSE NOUN NOUN
Senate Republicans PROPN PROPN
wing base NOUN NOUN
government shutdown NOUN NOUN
few points ADJ NOUN
Tea Party PROPN PROPN
legislative chicken ADJ NOUN
major infrastructure ADJ NOUN
infrastructure crisis NOUN NOUN
AP rep PROPN PROPN


In [None]:
import spacy

sample = """
Donald Trump was the 45th President of the of America, 
a position he by corrupt treasonous means in collusion Russian interference in American election 
He for the Rule and replace nationalist dictatorship using of the white supremacist type.
"""


nlp = spacy.load('en_core_web_sm')
doc = nlp(sample)
for i in range(len(doc)):
    j = i+1
    if j < len(doc):
        if ( (doc[i].pos_ == "ADJ" or doc[i].pos_ == "NOUN" or doc[i].pos_ == "PROPN") and (doc[j].pos_ == "ADJ" or doc[j].pos_ == "NOUN" or doc[j].pos_ == "PROPN") ):
            print(doc[i].text, doc[j].text,  doc[i].pos_, doc[j].pos_)

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
# sample = """
# Join Daily Kos and the Democratic Senatorial Campaign Committee (DSCC) to tell Senate Republicans: 
# Fix our roads and bridges instead of shutting down our government. 
# AS THE GOP PREVENTS US FROM REBUILDING BADLY DECAYING INFRASTRUCTURE BECAUSE IT WILL CREATE JOBS AND IMPROVE THE ECONOMY, 
# WHICH THEY DO NOT WANT, THEY HOPE THAT EVERY BRIDGE COLLAPSE WILL BE BLAMED ON OBAMA. 
# YEAH.... THANKS OBAMA! HOW STUPID DO THEY THINK WE ARE? 
# Tell Senate Republicans: Fix our roads and bridges instead of shutting down our government  
# Republicans are playing games with our government to appeal to their right wing base—pushing us to the brink of a government shutdown in order to score a few points with the Tea Party. 
# While they’re playing games of legislative chicken, our country is facing a major infrastructure crisis. The AP rep... 
# """

sample = """
Donald Trump was the 45th President of the of America, 
a position he by corrupt treasonous means in collusion Russian interference in American election 
He for the Rule and replace nationalist dictatorship using of the white supremacist type.
"""

doc = nlp(sample)
for token in doc:
    print(token.text, token.pos_, token.tag_, token.dep_)


 SPACE _SP 
Donald PROPN NNP compound
Trump PROPN NNP nsubj
was AUX VBD ROOT
the DET DT det
45th ADJ JJ amod
President PROPN NNP attr
of ADP IN prep
the DET DT pobj
of ADP IN prep
America PROPN NNP pobj
, PUNCT , punct

 SPACE _SP 
a DET DT det
position NOUN NN attr
he PRON PRP intj
by ADP IN prep
corrupt ADJ JJ amod
treasonous ADJ JJ amod
means NOUN NNS pobj
in ADP IN prep
collusion NOUN NN nmod
Russian ADJ JJ amod
interference NOUN NN pobj
in ADP IN prep
American ADJ JJ amod
election NOUN NN pobj

 SPACE _SP 
He PRON PRP ROOT
for ADP IN prep
the DET DT det
Rule PROPN NNP pobj
and CCONJ CC cc
replace VERB VB conj
nationalist ADJ JJ amod
dictatorship NOUN NN dobj
using VERB VBG advcl
of ADP IN prep
the DET DT det
white ADJ JJ amod
supremacist ADJ JJ compound
type NOUN NN pobj
. PUNCT . punct

 SPACE _SP 


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = """
 For G.O.P., Incentives for Budget Deal With Obama Delaying steps to rein in Social Security, Medicare and Medicaid means delaying significant attempts to curb the size of the government.	
  """

doc = nlp(text)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

In [None]:
## pos tagging

import csv
import pandas as pd

for i in range(10,11):

  sample = dfs[i]['text'].values.tolist()
  
  text_list = []
  pos_list = []
  tag_list = []
  import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)
    for word in sent:
      if (word.pos_ == "ADJ"):
        text_list.append(word.text)
        pos_list.append(word.pos_)
        tag_list.append(word.tag_)
      if (word.pos_ == "NOUN"):
        text_list.append(word.text)
        pos_list.append(word.pos_)
        tag_list.append(word.tag_)      
      if (word.pos_ == "PROPN") :
        text_list.append(word.text)
        pos_list.append(word.pos_)
        tag_list.append(word.tag_)
      
  temp = pd.DataFrame({'text': text_list, 'pos': pos_list, 'tag': tag_list })

  temp['text_lower'] = temp['text'].str.lower()
  temp2  = temp.groupby(['text_lower']).size().reset_index(name='counts')
  temp2.to_csv("temp_dem_d"+str(i)+".csv")
