<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/4_5_22_4_Extracting_new_politifact_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path
import glob
import json
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

In [None]:
# Colab-only: expose Google Drive under /content/drive so the dataset
# archive and CSV files referenced below can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project folder on the mounted Drive that holds the CSV files used below.
base_dir = Path("/content/drive/MyDrive/fakenews/Research FakeNews")
# Fail with an actionable message rather than a bare AssertionError.
assert base_dir.exists(), f"{base_dir} not found - is Google Drive mounted?"

## Read data

In [None]:
%%capture
!unzip /content/drive/MyDrive/fakenews/Research\ FakeNews/Datasets\ Fakenews/fakenewsnet_dataset_politifact.zip

In [None]:
# One directory per story: politifact/<label>/politifact<id>.
# (recursive=True is a no-op here because the pattern contains no "**".)
story_dir_pattern = "/content/fakenewsnet_dataset/politifact/*/politifact*"
items = glob.glob(story_dir_pattern, recursive=True)
len(items)

1056

In [None]:
def _process_content(path:Path) -> dict:
    """Load the scraped article stored in one story directory.

    Parameters
    ----------
    path : Path or str
        Story directory that may contain a ``news content.json`` file.

    Returns
    -------
    dict
        The parsed JSON document, or an empty dict when the file is absent.
    """
    # Path() makes plain string paths (as returned by glob.glob) work as well.
    filepath = Path(path)/"news content.json"
    if not filepath.exists():
        return dict()
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filepath, encoding="utf-8") as f:
        return json.load(f)

def _process_tweet(filepath:Path) -> dict:
    """Read one tweet JSON file and keep only the fields used downstream.

    Parameters
    ----------
    filepath : Path
        Path to a single tweet ``.json`` file.

    Returns
    -------
    dict
        Keys ``id``, ``text``, ``retweet_count`` from the tweet and
        ``user_id``, ``user_location``, ``user_friends_count``,
        ``user_followers_count`` from its ``user`` object. Any field that is
        missing is returned as ``None``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filepath, encoding="utf-8") as f:
        tweet = json.load(f)
    keys = ["id", "text", "retweet_count"]
    tweet_info = {k:tweet.get(k, None) for k in keys}
    user_keys = ["id", 'location', "friends_count", "followers_count"]
    # "user" can be absent *or* explicitly null; both must yield empty user info.
    user = tweet.get("user") or {}
    user_info = {f"user_{k}":user.get(k, None) for k in user_keys}
    return {**tweet_info, **user_info}

In [None]:
# Walk the story directories until one with at least one tweet file is found;
# `tweets` is left holding that story's tweet paths for the inspection cells below.
for story_dir in items:
    tweets = list(Path(story_dir, "tweets").glob("*.json"))
    if tweets:
        break

In [None]:
tweets[0]

PosixPath('/content/fakenewsnet_dataset/politifact/fake/politifact14235/tweets/873326045058838528.json')

In [None]:
_process_tweet(tweets[2])

{'id': 871714089986396161,
 'retweet_count': 0,
 'text': 'Liberal Women Hate Me Because Of My "Striking Beauty And Intellect", Says Kellyanne Conway - London Web News https://t.co/YbYq9LThas',
 'user_followers_count': 709,
 'user_friends_count': 1888,
 'user_id': 16560700,
 'user_location': ''}

In [None]:
# Column groups of the final frame, in output order.
base_keys = ["id", "label"]
news_content_keys = ['url', 'text', 'top_img', 'keywords', 'authors', 'canonical_link', 'title', 'meta_data', 'publish_date', 'source', 'summary']
tweeter_keys = ["retweets", "tweets"]
row_list = []

for e in tqdm(items):
    path = Path(e)
    news_content = _process_content(path)
    # Retweets are kept as ids only (the file name up to the first dot);
    # tweets are reduced to the fields selected by _process_tweet.
    retweet_ids = [retweet_file.name.split(".")[0] for retweet_file in (path/"retweets").glob("*.json")]
    tweets = [_process_tweet(tweet_file) for tweet_file in (path/"tweets").glob("*.json")]
    row = {
        # The directory name is the story id; its parent directory is the label.
        "id": path.name,
        "label": path.parent.name,
        # Article fields; None for anything the scrape did not provide.
        **{k: news_content.get(k, None) for k in news_content_keys},
        "retweets": retweet_ids,
        "tweets": tweets,
    }
    row_list.append(row)

df = pd.DataFrame(row_list, columns=base_keys+news_content_keys+tweeter_keys)

  0%|          | 0/1056 [00:00<?, ?it/s]

In [None]:
df.head()

Unnamed: 0,id,label,url,text,top_img,keywords,authors,canonical_link,title,meta_data,publish_date,source,summary,retweets,tweets
0,politifact14667,fake,https://www.facebook.com/StopDjTrump/photos/a....,,https://static.xx.fbcdn.net/rsrc.php/yD/r/d4ZI...,[],[],,Facebook,"{'viewport': 'width=device-width,initial-scale...",,https://www.facebook.com,,"[1022546912858103808, 1020337613474009089, 105...",[]
1,politifact14235,fake,http://londonwebnews.com/2017/06/01/liberal-wo...,"Kellyanne Conway, counselor to President Trump...",http://londonwebnews.com/wp-content/uploads/20...,[],[],http://londonwebnews.com/2017/06/01/liberal-wo...,Liberal Women Hate Me Because Of My “Striking ...,"{'viewport': 'width=device-width, initial-scal...",1496268000.0,http://londonwebnews.com,,"[873326045058838528, 872593394324951042, 87171...","[{'id': 873326045058838528, 'text': 'Liberal W..."
2,politifact15037,fake,www.reddit.com/r/conspiracy/comments/7ww8j1/ru...,Wir verwenden Cookies auf unseren Websites für...,https://www.redditstatic.com/desktop2x/img/fav...,[],[],https://www.reddit.com/r/conspiracy/comments/7...,Russian source behind Trump dossier killed in ...,"{'viewport': 'width=device-width, initial-scal...",,http://www.reddit.com,,"[963508344026693633, 962788790338174976, 96412...",[]
3,politifact15534,fake,https://web.archive.org/web/20180630141610/htt...,"They’ve only started on Jupiter recently, henc...",https://web.archive.org/web/20180630141610im_/...,[],[],https://web.archive.org/web/20180630141610/htt...,NASA ANNOUNCED THAT IT COMMUNICATED WITH FOUR ...,"{'viewport': 'width=device-width, initial-scal...",1529786000.0,https://web.archive.org,,"[909138789754441730, 906251398718910464, 88211...",[]
4,politifact14119,fake,https://web.archive.org/web/20170521002626/htt...,"Fox News’ bright and shining primetime star, T...",https://web.archive.org/web/20170521002626im_/...,[],[],https://web.archive.org/web/20170521002626/htt...,BREAKING: Fox Star Tucker Carlson In Critical ...,"{'viewport': 'width=device-width, initial-scal...",1494678000.0,https://web.archive.org,,"[863760194244759552, 863421319752364032, 86376...",[]


In [None]:
def get_tid(a):
    """Return the ``"id"`` of the first element of ``a``, or ``None`` if ``a`` is empty."""
    return a[0]["id"] if len(a) else None


# Sanity check: True would mean no story has a first tweet with a usable id.
df.tweets.map(get_tid).isna().all()

False

In [None]:
# Class balance as a (fake, real) pair of row counts.
tuple((df.label == label_name).sum() for label_name in ('fake', 'real'))

(432, 624)

In [None]:
# Number of stories whose article text is either missing or an empty string.
text_is_empty = df.text.str.len() == 0
(text_is_empty | df.text.isna()).sum()

181

## Load data

In [None]:
import ast

# The retweets/tweets columns hold lists that were written to CSV as text;
# literal_eval rebuilds them and only accepts Python literals.
converters = {"retweets":ast.literal_eval, "tweets":ast.literal_eval}
# NOTE(review): this reads the same politifact.csv that a later cell writes, so
# the file must already exist on Drive from a previous run.
# NOTE(review): publish_date appears to hold epoch seconds (see the preview
# below), which parse_dates may leave unparsed - confirm the resulting dtype.
df = pd.read_csv(base_dir/"politifact.csv", index_col=0, converters=converters, parse_dates=["publish_date"])

In [None]:
df.head()

Unnamed: 0,id,label,url,text,top_img,keywords,authors,canonical_link,title,meta_data,publish_date,source,summary,retweets,tweets
0,politifact1212,real,http://www.youtube.com/watch?v=5zrsl8o4ZPo&fea...,ein Google-Unternehmen\n\nDienste anbieten und...,http://www.google.com/favicon.ico,[],[],,Bevor Sie zu YouTube weitergehen,"{'viewport': 'initial-scale=1, maximum-scale=5...",,http://www.youtube.com,,[],"[{'id': None, 'text': None, 'retweet_count': N..."
1,politifact6730,real,http://www.motherjones.com/politics/2012/09/se...,"During a private fundraiser earlier this year,...",https://www.motherjones.com/wp-content/uploads...,[],"['David Corn', 'Dave Gilson', 'Tim Murphy', 'B...",https://www.motherjones.com/politics/2012/09/s...,SECRET VIDEO: Romney Tells Millionaire Donors ...,"{'viewport': 'width=device-width, initial-scal...",1347904858.0,http://www.motherjones.com,,[],"[{'id': None, 'text': None, 'retweet_count': N..."
2,politifact2298,real,https://web.archive.org/web/20050322064340/htt...,"COPYRIGHT © 2005 LexisNexis, a division of Ree...",,[],[],,LexisNexis(R) Publisher,{},,https://web.archive.org,,[],"[{'id': None, 'text': None, 'retweet_count': N..."
3,politifact87,real,http://www.ilga.gov/legislation/BillStatus.asp...,×\n\nThe Illinois General Assembly offers the ...,http://www.ilga.gov/LISlogo1.ico,[],[],,Illinois General Assembly,"{'classification': 'Government', 'distribution...",,http://www.ilga.gov,,[],[]
4,politifact3180,real,http://abcnews.go.com/Politics/rand-paul-repub...,"Feb. 4, 2011  -- In an exclusive interview wi...",http://abcnews.go.com/Politics/rand-paul-repub...,[],"['Abc News', 'Jonathan Karl', 'February']",https://abcnews.go.com/Politics/rand-paul-repu...,ABC News Exclusive: Rand Paul Says Republicans...,{'description': 'In an exclusive interview wit...,,http://abcnews.go.com,,[],[]


In [None]:
# Retweet ids per story, summarised as (min, mean, max).
num_retweets = df["retweets"].map(len)
(num_retweets.min(), num_retweets.mean(), num_retweets.max())

(0, 59.75473484848485, 21984)

In [None]:
# Tweets per story, summarised as (min, mean, max).
num_tweets = df["tweets"].map(len)
(num_tweets.min(), num_tweets.mean(), num_tweets.max())

(0, 552.5719696969697, 29060)

## Add data for entries missing `news_content`

In [None]:
list(base_dir.glob("*.csv"))

[PosixPath('/content/drive/MyDrive/fakenews/Research FakeNews/politifact_fake.csv'),
 PosixPath('/content/drive/MyDrive/fakenews/Research FakeNews/churn_data.csv'),
 PosixPath('/content/drive/MyDrive/fakenews/Research FakeNews/gossipcop_real.csv'),
 PosixPath('/content/drive/MyDrive/fakenews/Research FakeNews/politifact_real.csv'),
 PosixPath('/content/drive/MyDrive/fakenews/Research FakeNews/gossipcop_fake.csv'),
 PosixPath('/content/drive/MyDrive/fakenews/Research FakeNews/politifact.csv')]

In [None]:
# Original FakeNewsNet index files, one per class.
df_fake = pd.read_csv(base_dir/'politifact_fake.csv')
df_real = pd.read_csv(base_dir/'politifact_real.csv')
df_fake['label']=0
df_real['label']=1
class_names = ["Fake", "Real"]  # list position corresponds to the integer label

# DataFrame.append was removed in pandas 2.0. concat(..., ignore_index=True)
# is equivalent to the former append + reset_index(drop=True).
df_old = pd.concat([df_fake, df_real], ignore_index=True)

In [None]:
len(df_old.id.unique()), len(df)

(1054, 1056)

Two ids appear under both the `fake` and the `real` label. Their label is ambiguous, so they are inspected and then excluded below.

In [None]:
# Collect every id that occurs on more than one row, together with its rows.
repeated = [(story_id, rows) for story_id, rows in df.groupby("id") if len(rows) > 1]
dup_ids = [story_id for story_id, _ in repeated]
dup_groups = [rows for _, rows in repeated]
print(len(dup_ids))

2


In [None]:
dup_ids

['politifact14920', 'politifact14940']

In [None]:
dup_groups[0]

Unnamed: 0,id,label,url,text,top_img,keywords,authors,canonical_link,title,meta_data,publish_date,source,summary,retweets,tweets
213,politifact14920,fake,https://www.ecfr.gov/cgi-bin/text-idx?gp=&SID=...,Home\n\ngpo.gov\n\ngovinfo.gov\n\ne-CFR Naviga...,,[],[],https://www.ecfr.gov/,Electronic Code of Federal Regulations (eCFR),"{'content-type': 'text/html; charset=utf-8', '...",,https://www.ecfr.gov,,"[759182777761890304, 942617526986559488, 73000...",[]
741,politifact14920,real,https://www.ecfr.gov/cgi-bin/text-idx?gp=&SID=...,Home\n\ngpo.gov\n\ngovinfo.gov\n\ne-CFR Naviga...,,[],[],https://www.ecfr.gov/,Electronic Code of Federal Regulations (eCFR),"{'content-type': 'text/html; charset=utf-8', '...",,https://www.ecfr.gov,,"[1053921270297841665, 1060573426438553600, 105...",[]


In [None]:
dup_groups[1]

Unnamed: 0,id,label,url,text,top_img,keywords,authors,canonical_link,title,meta_data,publish_date,source,summary,retweets,tweets
35,politifact14940,fake,https://www.politico.com/story/2018/01/17/full...,Full text: Jeff Flake on Trump speech transcri...,https://cf-images.us-east-1.prod.boltdns.net/v...,[],"[Louis Nelson, Darren Samuelsohn, Politico Staff]",https://www.politico.com/story/2018/01/17/full...,Full text: Jeff Flake on Trump speech transcript,"{'article': {'opinion': 'false'}, 'og': {'titl...",1516144000.0,https://www.politico.com,,"[922998131758739457, 923200199539019776, 92294...","[{'id': 922998131758739457, 'text': 'Transcrip..."
478,politifact14940,real,https://www.politico.com/story/2018/01/17/full...,Full text: Jeff Flake on Trump speech transcri...,https://cf-images.us-east-1.prod.boltdns.net/v...,[],"[Louis Nelson, Darren Samuelsohn, Politico Staff]",https://www.politico.com/story/2018/01/17/full...,Full text: Jeff Flake on Trump speech transcript,"{'article': {'opinion': 'false'}, 'og': {'titl...",1516144000.0,https://www.politico.com,,"[922998131758739457, 923200199539019776, 92294...",[]


In [None]:
# Drop every row whose id is listed in dup_ids.
# reset_index(drop=True) returns a fresh frame with a clean 0..n-1 index. The
# previous in-place reset on a filtered slice inserted the old index as an extra
# "index" column, which then leaked into the saved CSV.
df = df[~df.id.isin(dup_ids)].reset_index(drop=True)
df.shape

(1052, 15)

In [None]:
# Lookup table: story id -> first non-null title in the original index files.
id2title = df_old.groupby("id")["title"].first().to_dict()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute: the in-place form acts on an intermediate object and is not
# guaranteed to update df (it has no effect under pandas copy-on-write).
df["title"] = df["title"].fillna("")
(df.title=="").sum()

161

In [None]:
# Back-fill empty titles from the original index; ids it does not know stay "".
needs_title = df.title == ""
df.loc[needs_title, "title"] = df.loc[needs_title, "id"].map(lambda story_id: id2title.get(story_id, ""))

In [None]:
(df.title=="").sum()

0

Every entry now has a title. Open question: is there a way to recover the article text for the entries that are still missing it?

### Get texts

In [None]:
%%capture
# NOTE(review): selenium is installed unpinned (-U pulls the latest release);
# consider pinning a version so the driver API used below stays stable.
!pip install -U selenium
!apt-get update && apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin so it can be found on PATH.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
# NOTE(review): sys.path is Python's module search path; adding the driver
# binary here presumably does not help locate the executable - confirm and drop.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

In [None]:
# Headless Chrome configured for a container without a display or a large /dev/shm.
chrome_options = webdriver.ChromeOptions()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)
# Do not pass the driver path positionally: current Selenium releases no longer
# accept it as the first argument. The chromedriver binary was copied to
# /usr/bin above, so it is resolved from PATH.
driver = webdriver.Chrome(options=chrome_options)

In [None]:
df_old.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,0
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,0
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,0
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,0
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,0


In [None]:
# Lookup table: story id -> first non-null news_url in the original index files.
id2url = df_old.groupby("id")["news_url"].first().to_dict()

In [None]:
# Back-fill missing URLs from the original index; ids it does not know get "".
missing_url = df.url.isna()
df.loc[missing_url, "url"] = df.loc[missing_url, "id"].map(lambda story_id: id2url.get(story_id, ""))

In [None]:
df_old.news_url.isna().sum()

61

In [None]:
(df.url.isna()).sum()

61

In [None]:
# URLs of the stories that still have no article text.
urls = df.loc[df.text.isna(), "url"].tolist()

In [None]:
def _extract_article_text(html):
    """Pull readable text out of an archived page.

    Uses the non-empty ``<p>`` paragraphs inside ``<div id="articleWrap">``
    when that container exists; otherwise falls back to the ``content`` of
    ``<meta name="DESCRIPTION">``. Returns ``None`` when neither is present.
    """
    soup = BeautifulSoup(html, "html.parser")
    article = soup.find("div", {"id":"articleWrap"})
    if article is not None:
        paragraphs = [p.text for p in article.find_all("p")]
        joined = "\n".join(p for p in paragraphs if p.strip())
        # Collapse every run of whitespace (newlines included) to one space.
        return " ".join(joined.split())
    meta = soup.find("meta", {"name":"DESCRIPTION"})
    if meta is not None:
        return meta.get("content", None)
    return None


# Maps the df index label of a story to the text recovered for it.
new_texts = dict()
for i, row in df[df.text.isna()].iterrows():
    url = row.url
    # A missing URL is NaN (a float), not None, so check for str explicitly;
    # otherwise the substring test below raises TypeError.
    if not isinstance(url, str):
        continue
    # Only archived HTML pages are fetched; PDFs cannot be parsed this way.
    if ("archive.org" not in url) or url.endswith(".pdf"):
        continue
    print(url)
    driver.get(url)
    text = _extract_article_text(driver.page_source)
    if text:
        new_texts[i] = text

https://web.archive.org/web/20070518064337/http://www.time.com:80/time/magazine/article/0,9171,810925,00.html
https://web.archive.org/web/20070212105712/http://www.bea.gov:80/national/xls/gdpchg.xls
https://web.archive.org/web/20051124053404/http://www.time.com:80/time/magazine/article/0,9171,1129494,00.html
https://web.archive.org/web/20100202204116/http://alexander.senate.gov/public/index.cfm?p=NewsArticles


In [None]:
len(new_texts)

2

In [None]:
# Show the first 100 characters of each recovered text.
for preview in (t[:100] for t in new_texts.values()):
    print(preview)

(See Cover) George Wilcken Romney, at 51, is a broad-shouldered, Bible-quoting broth of a man who bu
When it comes to raw political
talent, there's not a Bill Clinton in this group. But these are the
r


In [None]:
# Store each recovered text on its own row, with whitespace normalised.
# (Uses the loop variable `t`; the previous code read the stale global `text`
# left over from the scraping loop, writing the same text into every row.)
for i, t in new_texts.items():
    df.loc[i, "text"] = " ".join(t.split())

In [None]:
# Persist the updated frame to Drive.
# NOTE(review): this overwrites the same politifact.csv that the "Load data"
# cell reads, so the notebook's input changes after every run.
df.to_csv(base_dir/"politifact.csv")

## Data cleaning

In [None]:
# Keep only the rows that have a non-empty title.
df = df.loc[df["title"].str.len().gt(0)]

# Groups of rows that share the same title, kept for inspection.
dups = [group for _, group in df.groupby("title") if len(group) > 1]

In [None]:
# One row per title. GroupBy.first() takes the first non-null value of each
# column within a group, so a row may combine values from several duplicates.
df = df.groupby("title", as_index=False).first()

# Title plus article text. A missing text is treated as empty so the result is
# the title alone rather than NaN; strip() removes the then-dangling space.
texts = (df.title + " " + df.text.fillna("")).str.strip()

In [None]:
len(texts)

741

In [None]:
texts

0      "Face the Nation" transcripts, August 26, 2012...
1      'Discriminatory animus': Trump sued on DACA Ne...
2      'This Week' Transcript: Adm. Mike Mullen Novem...
3      'This Week' Transcript: Biden July 11, 2010  ...
4      'This Week' Transcript: Former Vice President ...
                             ...                        
736    ‘NASA Confirms’ 15 Days Of Darkness Coming Thi...
737    ‘Smallville’ Star Confesses She Sold Children ...
738    ‘The Daily Show’s’ Michelle Wolf Lands Her Own...
739    “Dictionary” on President Obama’s Health Care ...
740    “Gay People Should Wear Specially-Colored Clot...
Length: 741, dtype: object

## Source

In [None]:
# Strip the Wayback Machine prefix so archived pages map to their original site.
# news_url is a column of df_old (df exposes `url` instead). The pattern is a
# raw string with literal dots, which also avoids the invalid "\d" escape.
source = df_old.news_url.str.replace(r"^(https://)?web\.archive\.org/web/\d+/", "", regex=True)

In [None]:
# Drop the scheme and a leading "www." (dot escaped so only a literal "www." matches).
source = source.str.replace(r"^(https?://)?(www\.)?", "", regex=True)

In [None]:
# Cut everything from the first recognised TLD onward, leaving the site name.
# Raw string: "\." in a normal string literal is an invalid escape sequence.
# NOTE(review): the TLD alternatives are not anchored at a label boundary, so
# e.g. ".co" also matches the start of ".company" - confirm this is acceptable.
source = source.str.replace(r"\.(com|info|org|gov|tv|us|news|me|co\.uk|net|club|co|live|edu|xyz|site|life|ru|online|tk|website|pw|one|world|mil).*$", "", regex=True)

In [None]:
len(source), len(source.unique())

(1056, 526)

## Number of retweets

In [None]:
# For each list-valued column add its length and log(1 + length) as features.
for list_column in ("retweets", "tweets"):
    counts = df[list_column].map(len)
    df[f"num_{list_column}"] = counts
    df[f"log_num_{list_column}"] = np.log1p(counts.to_numpy())

In [None]:
df.describe()

Unnamed: 0,summary,num_retweets,log_num_retweets,num_tweets,log_num_tweets
count,0.0,741.0,741.0,741.0,741.0
mean,,72.734143,0.810143,464.350877,3.376457
std,,847.921827,1.829153,2057.171092,2.580387
min,,0.0,0.0,0.0,0.0
25%,,0.0,0.0,1.0,0.693147
50%,,0.0,0.0,34.0,3.555348
75%,,0.0,0.0,220.0,5.398163
max,,21984.0,9.998116,29060.0,10.277152


## More to come