In [8]:
import os
import pandas as pd
import json
import numpy as np

from dotenv import load_dotenv
load_dotenv()

True

In [9]:
import tweepy
import traceback

In [19]:
import spacy
import torch

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from spacymoji import Emoji

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from transformers import AutoModel, BertTokenizerFast

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

In [3]:
def get_src_text(row):
    path = "data/en/" + row['event'] + "/" + str(row['threadid']) + "/source-tweets/" + str(row['tweetid']) + ".json"
    with open(path, "r") as f:
        source = json.loads(f.read())
        return source['text']

def is_true(row):
    print(row)
    path = "data/en/" + row['event'] + "/" + str(row['threadid']) + "/annotation.json"
    with open(path, "r") as f:
        source = json.loads(f.read())
        return str(source.get('true', 'unverified'))

In [4]:
# Dataset used: https://figshare.com/articles/dataset/PHEME_rumour_scheme_dataset_journalism_use_case/2068650
df = pd.read_json("data/en-scheme-annotations.json", dtype = {"threadid": str, "tweetid": str}, lines=True)

df['true'] = df.apply(lambda row: is_true(row), axis=1)
df['src_text'] = df.apply(lambda row: get_src_text(row), axis=1)

print(df.head())

event                  putinmissing
threadid         577258317942149120
tweetid          577258317942149120
support                  supporting
evidentiality             url-given
certainty          somewhat-certain
Name: 0, dtype: object


FileNotFoundError: [Errno 2] No such file or directory: 'data/en/putinmissing/577258317942149120/annotation.json'

In [17]:
train_text, temp_text, train_labels, temp_labels = train_test_split(df['src_text'], df['true'], 
                                                                    random_state=2018, 
                                                                    test_size=0.3)


val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2018, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

train_frame = pd.DataFrame([train_text, train_labels])
val_frame = pd.DataFrame([val_text, val_labels])
test_frame = pd.DataFrame([test_text, test_labels])

In [6]:
print(val_labels.value_counts())
print(train_text)

0             22
1             18
unverified     5
Name: true, dtype: int64
108    PRINCE IN TORONTO TONIGHT:\n@3RDEYEGIRL tweete...
85     Up to 20 held hostage in Sydney Lindt Cafe sie...
170    Five hostages have escaped the besieged Lindt ...
76     #sydneysiege is over. 2 confirmed dead, #PrayF...
286    BREAKING: Three hostages appear to have escape...
                             ...                        
156    BREAKING  @SkyBusiness: freed hostage borne hi...
265    DETAILS: The hostage site is Lindt Chocolat Ca...
226    Currently the #FoxNews website has zero, repea...
102    Police convoy and helicopters are rushing to s...
250    Local media: 3 people appear to escape from Ma...
Name: src_text, Length: 207, dtype: object


In [53]:
nlp_spacymoji = spacy.load("en_core_web_sm")
emoji = Emoji(nlp_spacymoji, merge_spans=True)
nlp_spacymoji.add_pipe('emoji', first=True)
# tokenised_train = train_frame.apply(lambda row: nlp_spacymoji(row['src_text']))
# #tok_train = nlp_spacymoji(train_text)
# print(tokenised_train)

def spacy_tokeniser(text):
    tokens = []
    for w in nlp_spacymoji(text):
        tokens.append(w.lemma_.lower())
    return tokens

In [57]:
# Get features for TF-IDF
tfidf = TfidfVectorizer()#tokenizer=spacy_tokeniser)
tfidf.fit(train_text)
train_features = tfidf.transform(train_text)
validation_features = tfidf.transform(val_text)
test_features = tfidf.transform(test_text)

In [58]:
print(train_features)

  (0, 1199)	0.24445295777678855
  (0, 1130)	0.2633723219567455
  (0, 1117)	0.3973731771501845
  (0, 1114)	0.212110091085962
  (0, 1110)	0.10389814156046775
  (0, 1094)	0.1838272529799555
  (0, 959)	0.2633723219567455
  (0, 950)	0.20491728264962708
  (0, 864)	0.19319072690600506
  (0, 799)	0.2633723219567455
  (0, 787)	0.12850499352435185
  (0, 776)	0.2633723219567455
  (0, 725)	0.22061737780894516
  (0, 584)	0.19319072690600506
  (0, 577)	0.13701228024733503
  (0, 555)	0.08130279123838846
  (0, 207)	0.2633723219567455
  (0, 156)	0.2633723219567455
  (0, 26)	0.2633723219567455
  (1, 1144)	0.25764378654454123
  (1, 1110)	0.15511401535306543
  (1, 1064)	0.18994197665972787
  (1, 995)	0.26280630051960463
  (1, 882)	0.39319989537825556
  (1, 643)	0.28842276607929185
  :	:
  (205, 507)	0.22431768606672714
  (205, 343)	0.297348050812346
  (205, 288)	0.297348050812346
  (205, 262)	0.14337508668613588
  (205, 237)	0.22431768606672714
  (205, 121)	0.17604724059817461
  (205, 110)	0.1583217914753

In [38]:
def evaluation_summary(description, true_labels, predictions):
  print("Evaluation for: " + description)
  print(classification_report(true_labels, predictions,  digits=3, zero_division=0))
  print('\nConfusion matrix:\n',confusion_matrix(true_labels, predictions))

In [56]:
# SVC
svc = SVC(kernel='rbf')
svc_model = svc.fit(train_features, train_labels)
print(svc_model.score(validation_features, val_labels))
svc_predicted_labels = svc_model.predict(validation_features)

evaluation_summary("SVC", val_labels, svc_predicted_labels)
svc_test = svc_model.predict(test_features)
evaluation_summary("SVC test", test_labels, svc_test)

# Logistic Regression with TF-IDF
lr_tfidf = LogisticRegression()
lr_model_tfidf = lr_tfidf.fit(train_features, train_labels)
print(lr_model_tfidf.score(validation_features, val_labels))
print(lr_model_tfidf.score(test_features, test_labels))

lr_predicted_labels_tfidf = lr_model_tfidf.predict(validation_features)
evaluation_summary("LR (TF-IDF)", val_labels, lr_predicted_labels_tfidf)
lrtfidf_test = lr_model_tfidf.predict(test_features)
evaluation_summary("LR (TF-IDF) test", test_labels, lrtfidf_test)

# Dummy Majority
dumb = DummyClassifier(strategy='most_frequent')
dumb.fit(train_features, train_labels)
print(dumb.score(validation_features, val_labels))
dumb_validation_predicted_labels = dumb.predict(validation_features)
evaluation_summary("Dummy majority", val_labels, dumb_validation_predicted_labels)
dumb_test = dumb.predict(test_features)
evaluation_summary("Dummy MF test", test_labels, dumb_test)

0.6222222222222222
Evaluation for: SVC
              precision    recall  f1-score   support

           0      0.737     0.636     0.683        22
           1      0.538     0.778     0.636        18
  unverified      0.000     0.000     0.000         5

    accuracy                          0.622        45
   macro avg      0.425     0.471     0.440        45
weighted avg      0.576     0.622     0.588        45


Confusion matrix:
 [[14  8  0]
 [ 4 14  0]
 [ 1  4  0]]
Evaluation for: SVC test
              precision    recall  f1-score   support

           0      0.833     0.455     0.588        22
           1      0.500     0.889     0.640        18
  unverified      1.000     0.200     0.333         5

    accuracy                          0.600        45
   macro avg      0.778     0.514     0.521        45
weighted avg      0.719     0.600     0.581        45


Confusion matrix:
 [[10 12  0]
 [ 2 16  0]
 [ 0  4  1]]
0.6222222222222222
0.6
Evaluation for: LR (TF-IDF)
         

In [21]:
def get_src_text_by_index(row):
    return df.iloc[row.index]['src_text']

accuracy = pd.DataFrame([test_labels, lrtfidf_test])
print(accuracy)
print(lrtfidf_test)
print(test_labels)
print(type(test_labels))
test_df = pd.DataFrame(test_labels)
test_df['predicted'] = lrtfidf_test
print(test_df.apply(lambda row: get_src_text_by_index(row)))
print(test_df)
#test_df = pd.DataFrame({"tweet_id":test_labels[0], "actual_label": test_labels[1]})
test_labels.add(lrtfidf_test)
print(test_labels)

           127 18   143  161 32   131  177         19          29   172  ...  \
true         1   1    1    1   0    0    0           0  unverified    0  ...   
Unnamed 0  NaN   0  NaN  NaN   0  NaN  NaN  unverified           1  NaN  ...   

           288  202  162 5    58   69  17          210  223  54   
true         1    0    1   1    1    0   0  unverified    0    0  
Unnamed 0  NaN  NaN  NaN   1  NaN  NaN   0         NaN  NaN  NaN  

[2 rows x 45 columns]
['1' '0' '0' '1' '0' '1' '1' '0' '1' '1' '1' '1' '0' '0' '0' '1' '0' '0'
 '0' 'unverified' '1' '1' '0' '0' '1' '1' '1' '1' '1' '1' '1' '1' '0' '0'
 '1' '1' '0' '1' '1' '1' '0' '1' '1' '0' '1']
127             1
18              1
143             1
161             1
32              0
131             0
177             0
19              0
29     unverified
172             0
92              1
51              1
215             0
7               1
206             0
285             1
269             1
118             0
36              0


In [22]:
# Get output

input_text = "Watch video showing gunfire inside Canada's pa"
input_df = pd.DataFrame({"src_text": [input_text]})
input_features = tfidf.transform(input_df)

predicted_label = lr_model_tfidf.predict(input_features)

print(predicted_label)

['0']


In [26]:
# New dataset (FakeNewsNet)
fake_df = pd.read_csv("data/FakeNewsNet/fake.csv")
fake_df['true'] = False
real_df = pd.read_csv("data/FakeNewsNet/real.csv")
real_df['true'] = True

df = pd.concat([fake_df, real_df])
print(len(df))
df.head()

225940


Unnamed: 0.1,Unnamed: 0,id,text,author_id,source,created_at,edit_history_tweet_ids,lang,description,username,verified,name,location,true
0,0,937349434668498944,BREAKING: First NFL Team Declares Bankruptcy O...,4219197432,Twitter Web Client,2017-12-03T15:54:54.000Z,['937349434668498944'],en,Ofelia. Arizmendez @ deplorable me. I am a pro...,OfeliasHeaven,False,Ofelia Duchess Arizmendez,"Sugar Land, TX",False
1,1,937379378006282240,BREAKING: First NFL Team Declares Bankruptcy O...,3018973429,Facebook,2017-12-03T17:53:54.000Z,['937379378006282240'],en,,lorn_cramer,False,Lorn Cramer,,False
2,2,937380068590055425,BREAKING: First NFL Team Declares Bankruptcy O...,3018973429,Facebook,2017-12-03T17:56:38.000Z,['937380068590055425'],en,,lorn_cramer,False,Lorn Cramer,,False
3,3,937429898670600192,BREAKING: First NFL Team Declares Bankruptcy O...,23162382,Twitter for iPhone,2017-12-03T21:14:39.000Z,['937429898670600192'],en,Happily married conservative Pentecostal woman...,starchaser57,False,🌹Star Chaser 🌼🇺🇸,Ozarks. Missouri,False
4,4,937449906352152576,BREAKING: First NFL Team Declares Bankruptcy O...,1409084934,Twitter Web Client,2017-12-03T22:34:09.000Z,['937449906352152576'],en,Deplorable member of The Silenced Majority.. #...,ThePipeStore,False,Mr. Walker 🇺🇸,,False


In [None]:
# BERT

In [20]:
# https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=URn-DWJt5xhP
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

import transformers as ppb
model_class, tokenizer_class, bert_model_name = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

tokenizer = tokenizer_class.from_pretrained(bert_model_name)
model = model_class.from_pretrained(bert_model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.pr

In [27]:
tokenized = df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [28]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [29]:
input_ids = torch.tensor(np.array(padded))

with torch.no_grad():
    last_hidden_states = model(input_ids) # ERROR: Tries to get 255 GB of RAM

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 255424266240 bytes.

In [9]:
features = last_hidden_states[0][:,0,:].numpy()

In [10]:
labels = df['true']
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [11]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [12]:
lr_clf.score(test_features, test_labels)

0.5866666666666667

In [None]:
# Cleaning dataset

In [14]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer
from collections import Counter

ps = PorterStemmer()
wnl = nltk.stem.WordNetLemmatizer()

stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karli\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\karli\AppData\Roaming\nltk_data...


In [24]:
remove_col = ['threadid','tweetid','support', 'evidentiality', 'certainty', 'event']

# Impute null values with None
def null_process(feature_df):
    col = 'src_text'
    feature_df.loc[feature_df[col].isnull(), col] = "None"
    return feature_df

# Removed unused clumns
def remove_unused_col(df,column_n=remove_col):
    df = df.drop(column_n,axis=1)
    return df

def clean_dataset(df):
    # remove unused column
    df = remove_unused_col(df)
    #impute null values
    df = null_process(df)
    return df

# Cleaning text from unused characters
def clean_text(text):
    text = str(text).replace(r'http[\w:/\.]+', ' ')  # removing urls
    text = str(text).replace(r'[^\.\w\s]', ' ')  # remove everything but characters and punctuation
    text = str(text).replace('[^a-zA-Z]', ' ')
    text = str(text).replace(r'\s\s+', ' ')
    text = text.lower().strip()
    #text = ' '.join(text)    
    return text

## Nltk Preprocessing includes:
# Stop words, Stemming and Lemmetization
def nltk_preprocess(text):
    text = clean_text(text)
    wordlist = re.sub(r'[^\w\s]', '', text).split()
    text = ' '.join([word for word in wordlist if word not in stopwords_dict])
    text = [ps.stem(word) for word in wordlist if not word in stopwords_dict]
    text = ' '.join([wnl.lemmatize(word) for word in wordlist if word not in stopwords_dict])
    return  text

In [25]:
df = clean_dataset(df)
df["src_text"] = df.src_text.apply(nltk_preprocess)
df.head()

Unnamed: 0,true,src_text
0,unverified,vladimir putin netralized internal coup maybe ...
1,unverified,coup rt jimgeraghty rumor russian military att...
2,unverified,hoppla l0gg0l swiss rumor putin absence due gi...
3,unverified,putin reappears tv amid claim unwell threat co...
4,1,france 10 people dead shooting hq satirical we...


In [None]:
# New dataset - FakeNewsNet

In [3]:
politifact_fake_df = pd.read_csv("data/FakeNewsNet-master/dataset/politifact_fake.csv")

pf_fake_tweet_ids = politifact_fake_df['tweet_ids'].str.split('\t').explode().tolist()
print(len(pf_fake_tweet_ids))
print()

politifact_real_df = pd.read_csv("data/FakeNewsNet-master/dataset/politifact_real.csv")
pf_real_tweet_ids = politifact_real_df['tweet_ids'].str.split('\t').explode().tolist()
print(len(pf_real_tweet_ids))

# def concat_tweet_ids(row):
#     global pf_fake_tweet_ids
#     print('tweet_ids' in row)
#     if 'tweet_ids' in row and row['tweet_ids'] != "":
#         pf_fake_tweet_ids = pf_fake_tweet_ids + row['tweet_ids'].split("\t")
    
# politifact_fake_df.apply(lambda x: concat_tweet_ids(x))
# print(pf_fake_tweet_ids)

fake_short = pf_fake_tweet_ids[:20]

print(fake_short)

165392

418379
['937349434668498944', '937379378006282240', '937380068590055425', '937384406511005696', '937387493451862016', '937400766024896512', '937406789686980608', '937411332240011266', '937415066810503168', '937427631661768704', '937429898670600192', '937436145004302337', '937438119468699648', '937449906352152576', '937450317142286336', '937451599320027136', '937452013939494912', '937452151227510784', '937453119478423553', '937462176293437443']


In [4]:
api_key = os.environ.get('APIKey')
api_key_secret = os.environ.get('APIKeySecret')
access_token = os.environ.get('AccessToken')
access_token_secret = os.environ.get('AccessTokenSecret')
bearer_token = os.environ.get('BearerToken')

In [None]:
# https://stackoverflow.com/questions/72505232/feature-extraction-with-tweet-ids
api = tweepy.Client(consumer_key=api_key, 
                       consumer_secret=api_key_secret,
                       access_token=access_token, 
                       access_token_secret=access_token_secret,
                       bearer_token=bearer_token, 
                       wait_on_rate_limit=True,
                        )

auth = tweepy.OAuth1UserHandler(
   api_key, api_key_secret, access_token, access_token_secret
)

api = tweepy.API(auth)

## --------------------------------------------------- IMPORTANT! 
IDs = fake_short

tweets_df = pd.DataFrame()

print("Total tweets to fetch:", len(IDs))

for counter, tweet_id in enumerate(IDs):
    try:
        if counter % 50 == 0:
            print("Tweets analysed so far:", counter)
            
        info_tweet = api.get_status(tweet_id, tweet_mode="extended")
        
        row = pd.DataFrame({'ID': info_tweet.id,
                             'Text': info_tweet.full_text,
                             'Created at': info_tweet.created_at,
                             'User location': info_tweet.user.location,
                             'Num Followers': info_tweet.user.followers_count,
                             'Num Friends': info_tweet.user.friends_count,
                             'Num Favourites': info_tweet.user.favourites_count,
                             'User description': info_tweet.user.description,
                             'User verified': info_tweet.user.verified,
                             'Language': info_tweet.lang}, index=[0])
        
        tweets_df = pd.concat([tweets_df, row])
        tweets_df = tweets_df.reset_index(drop=True)
    except:
        pass

print("Total successfuly fetched tweets:", len(tweets_df))

tweets_df.to_csv("fake_short.csv")

In [5]:
# Experiments with Tweet IDs
print(len(pf_fake_tweet_ids))
print(np.nan in pf_fake_tweet_ids)
pf_fake_tweet_ids = [x for x in pf_fake_tweet_ids if x is not np.nan]
print(len(pf_fake_tweet_ids))
print(np.nan in pf_fake_tweet_ids)

165392
True
165352
False


In [6]:
print(len(pf_real_tweet_ids))
print(np.nan in pf_real_tweet_ids)
pf_real_tweet_ids = [x for x in pf_real_tweet_ids if x is not np.nan]
print(len(pf_real_tweet_ids))
print(np.nan in pf_real_tweet_ids)

418379
True
418164
False


In [7]:
#print("type of response:", type(response))
# print(response)
# print(len(response.includes))
#print(response)
#print(response.includes['users'])
# counter = 0
# last_author_id = 0
# for tweet in response.data:
#     # "user.location", "user.followers_count", "user.friends_count",
#     #                          "user.favourites_count", "user.description", "user.verified"
#     #print(tweet)
#     if last_author_id == tweet.author_id:
#         counter -= 1
#     print(tweet.id, tweet.created_at, tweet.author_id)
#     print(response.includes['users'][counter].id)
#     last_author_id = tweet.author_id
#     counter += 1

import requests

client = tweepy.Client(bearer_token,
                       return_type = requests.Response,
                       wait_on_rate_limit=True)



# ------------------------------------------- IMPORTANT!! (Tweet IDs)
print("Fake news!")
IDs = pf_fake_tweet_ids
n = len(IDs)
# tweets_dict = {}
for batch_ix in range(0, n, 100):
    if batch_ix % 500 == 0:
        print("Tweets analysed so far:", batch_ix)
    
    response = client.get_tweets(IDs[batch_ix : min(batch_ix + 100, n)], 
                                 tweet_fields=
                                     ["created_at", "text", "lang", "author_id", "source"],
                                 user_fields=
                                     ["location", "description", "verified"],
                                 expansions="author_id")
    # https://stackoverflow.com/questions/38987/how-do-i-merge-two-dictionaries-in-a-single-expression
    #     tweets_dict = tweets_dict | response.json()
    #tweets_dict.append(response.json)

    # https://www.kirenz.com/post/2021-12-10-twitter-api-v2-tweepy-and-pandas-in-python/twitter-api-v2-tweepy-and-pandas-in-python/
    # Save data as dictionary
    tweets_dict = response.json() 

    try:
        # Extract "data" value from dictionary
        tweets_data = tweets_dict['data'] # can fail if no data in response
        user_data = tweets_dict['includes']['users']

        # Transform to pandas Dataframe
        df_tweets = pd.json_normalize(tweets_data)
        df_tweets.head()

        # Transform to pandas Dataframe
        df_users = pd.json_normalize(user_data)
        df_users = df_users.rename(columns={'id': 'author_id'})
        df_users.head()

        df = pd.merge(df_tweets, df_users, on=["author_id"])

        # Standardise order of columns
        df = df[['id', 'text', 'author_id', 'source', 'created_at', 'edit_history_tweet_ids', 'lang', 'description', 'username', 'verified', 'name', 'location']]

        include_header = batch_ix == 0
        df.to_csv("fake.csv", mode='a', header=include_header)
    except:
        continue
    
print("------------------------------------------------")
print("Real news!")
IDs = pf_real_tweet_ids
n = len(IDs)
# tweets_dict = {}
for batch_ix in range(0, n, 100):
    if batch_ix % 500 == 0:
        print("Tweets analysed so far:", batch_ix)
    
    response = client.get_tweets(IDs[batch_ix : min(batch_ix + 100, n)], 
                                 tweet_fields=
                                     ["created_at", "text", "lang", "author_id", "source"],
                                 user_fields=
                                     ["location", "description", "verified"],
                                 expansions="author_id")
    # https://stackoverflow.com/questions/38987/how-do-i-merge-two-dictionaries-in-a-single-expression
    #     tweets_dict = tweets_dict | response.json()
    #tweets_dict.append(response.json)

    # https://www.kirenz.com/post/2021-12-10-twitter-api-v2-tweepy-and-pandas-in-python/twitter-api-v2-tweepy-and-pandas-in-python/
    # Save data as dictionary
    tweets_dict = response.json() 

    # Extract "data" value from dictionary
    try:
        tweets_data = tweets_dict['data'] # can fail if no data in response
        user_data = tweets_dict['includes']['users']

        # Transform to pandas Dataframe
        df_tweets = pd.json_normalize(tweets_data)
        df_tweets.head()

        # Transform to pandas Dataframe
        df_users = pd.json_normalize(user_data)
        df_users = df_users.rename(columns={'id': 'author_id'})
        df_users.head()

        df = pd.merge(df_tweets, df_users, on=["author_id"])

        # Standardise order of columns
        df = df[['id', 'text', 'author_id', 'source', 'created_at', 'edit_history_tweet_ids', 'lang', 'description', 'username', 'verified', 'name', 'location']]

        include_header = batch_ix == 0
        df.to_csv("real.csv", mode='a', header=include_header)
    except:
        continue

Fake news!
Tweets analysed so far: 0


Rate limit exceeded. Sleeping for 542 seconds.


Tweets analysed so far: 500
Tweets analysed so far: 1000
Tweets analysed so far: 1500
Tweets analysed so far: 2000
Tweets analysed so far: 2500
Tweets analysed so far: 3000
Tweets analysed so far: 3500
Tweets analysed so far: 4000
Tweets analysed so far: 4500
Tweets analysed so far: 5000
Tweets analysed so far: 5500
Tweets analysed so far: 6000
Tweets analysed so far: 6500
Tweets analysed so far: 7000
Tweets analysed so far: 7500
Tweets analysed so far: 8000
Tweets analysed so far: 8500
Tweets analysed so far: 9000
Tweets analysed so far: 9500
Tweets analysed so far: 10000
Tweets analysed so far: 10500
Tweets analysed so far: 11000
Tweets analysed so far: 11500
Tweets analysed so far: 12000
Tweets analysed so far: 12500
Tweets analysed so far: 13000
Tweets analysed so far: 13500
Tweets analysed so far: 14000
Tweets analysed so far: 14500
Tweets analysed so far: 15000
Tweets analysed so far: 15500
Tweets analysed so far: 16000
Tweets analysed so far: 16500
Tweets analysed so far: 17000


Rate limit exceeded. Sleeping for 762 seconds.


Tweets analysed so far: 30000
Tweets analysed so far: 30500
Tweets analysed so far: 31000
Tweets analysed so far: 31500
Tweets analysed so far: 32000
Tweets analysed so far: 32500
Tweets analysed so far: 33000
Tweets analysed so far: 33500
Tweets analysed so far: 34000
Tweets analysed so far: 34500
Tweets analysed so far: 35000
Tweets analysed so far: 35500
Tweets analysed so far: 36000
Tweets analysed so far: 36500
Tweets analysed so far: 37000
Tweets analysed so far: 37500
Tweets analysed so far: 38000
Tweets analysed so far: 38500
Tweets analysed so far: 39000
Tweets analysed so far: 39500
Tweets analysed so far: 40000
Tweets analysed so far: 40500
Tweets analysed so far: 41000
Tweets analysed so far: 41500
Tweets analysed so far: 42000
Tweets analysed so far: 42500
Tweets analysed so far: 43000
Tweets analysed so far: 43500
Tweets analysed so far: 44000
Tweets analysed so far: 44500
Tweets analysed so far: 45000
Tweets analysed so far: 45500
Tweets analysed so far: 46000
Tweets ana

Rate limit exceeded. Sleeping for 721 seconds.


Tweets analysed so far: 60000
Tweets analysed so far: 60500
Tweets analysed so far: 61000
Tweets analysed so far: 61500
Tweets analysed so far: 62000
Tweets analysed so far: 62500
Tweets analysed so far: 63000
Tweets analysed so far: 63500
Tweets analysed so far: 64000
Tweets analysed so far: 64500
Tweets analysed so far: 65000
Tweets analysed so far: 65500
Tweets analysed so far: 66000
Tweets analysed so far: 66500
Tweets analysed so far: 67000
Tweets analysed so far: 67500
Tweets analysed so far: 68000
Tweets analysed so far: 68500
Tweets analysed so far: 69000
Tweets analysed so far: 69500
Tweets analysed so far: 70000
Tweets analysed so far: 70500
Tweets analysed so far: 71000
Tweets analysed so far: 71500
Tweets analysed so far: 72000
Tweets analysed so far: 72500
Tweets analysed so far: 73000
Tweets analysed so far: 73500
Tweets analysed so far: 74000
Tweets analysed so far: 74500
Tweets analysed so far: 75000
Tweets analysed so far: 75500
Tweets analysed so far: 76000
Tweets ana

Rate limit exceeded. Sleeping for 107 seconds.


Tweets analysed so far: 90500
Tweets analysed so far: 91000
Tweets analysed so far: 91500
Tweets analysed so far: 92000
Tweets analysed so far: 92500
Tweets analysed so far: 93000
Tweets analysed so far: 93500
Tweets analysed so far: 94000
Tweets analysed so far: 94500
Tweets analysed so far: 95000
Tweets analysed so far: 95500
Tweets analysed so far: 96000
Tweets analysed so far: 96500
Tweets analysed so far: 97000
Tweets analysed so far: 97500
Tweets analysed so far: 98000
Tweets analysed so far: 98500
Tweets analysed so far: 99000
Tweets analysed so far: 99500
Tweets analysed so far: 100000
Tweets analysed so far: 100500
Tweets analysed so far: 101000
Tweets analysed so far: 101500
Tweets analysed so far: 102000
Tweets analysed so far: 102500
Tweets analysed so far: 103000
Tweets analysed so far: 103500
Tweets analysed so far: 104000
Tweets analysed so far: 104500
Tweets analysed so far: 105000
Tweets analysed so far: 105500
Tweets analysed so far: 106000
Tweets analysed so far: 106

Rate limit exceeded. Sleeping for 764 seconds.


Tweets analysed so far: 120000
Tweets analysed so far: 120500
Tweets analysed so far: 121000
Tweets analysed so far: 121500
Tweets analysed so far: 122000
Tweets analysed so far: 122500
Tweets analysed so far: 123000
Tweets analysed so far: 123500
Tweets analysed so far: 124000
Tweets analysed so far: 124500
Tweets analysed so far: 125000
Tweets analysed so far: 125500
Tweets analysed so far: 126000
Tweets analysed so far: 126500
Tweets analysed so far: 127000
Tweets analysed so far: 127500
Tweets analysed so far: 128000
Tweets analysed so far: 128500
Tweets analysed so far: 129000
Tweets analysed so far: 129500
Tweets analysed so far: 130000
Tweets analysed so far: 130500
Tweets analysed so far: 131000
Tweets analysed so far: 131500
Tweets analysed so far: 132000
Tweets analysed so far: 132500
Tweets analysed so far: 133000
Tweets analysed so far: 133500
Tweets analysed so far: 134000
Tweets analysed so far: 134500
Tweets analysed so far: 135000
Tweets analysed so far: 135500
Tweets a

Rate limit exceeded. Sleeping for 760 seconds.


Tweets analysed so far: 150500
Tweets analysed so far: 151000
Tweets analysed so far: 151500
Tweets analysed so far: 152000
Tweets analysed so far: 152500
Tweets analysed so far: 153000
Tweets analysed so far: 153500
Tweets analysed so far: 154000
Tweets analysed so far: 154500
Tweets analysed so far: 155000
Tweets analysed so far: 155500
Tweets analysed so far: 156000
Tweets analysed so far: 156500
Tweets analysed so far: 157000
Tweets analysed so far: 157500
Tweets analysed so far: 158000
Tweets analysed so far: 158500
Tweets analysed so far: 159000
Tweets analysed so far: 159500
Tweets analysed so far: 160000
Tweets analysed so far: 160500
Tweets analysed so far: 161000
Tweets analysed so far: 161500
Tweets analysed so far: 162000
Tweets analysed so far: 162500
Tweets analysed so far: 163000
Tweets analysed so far: 163500
Tweets analysed so far: 164000
Tweets analysed so far: 164500
Tweets analysed so far: 165000
------------------------------------------------
Real news!
Tweets ana

Rate limit exceeded. Sleeping for 749 seconds.


Tweets analysed so far: 15000
Tweets analysed so far: 15500
Tweets analysed so far: 16000
Tweets analysed so far: 16500
Tweets analysed so far: 17000
Tweets analysed so far: 17500
Tweets analysed so far: 18000
Tweets analysed so far: 18500
Tweets analysed so far: 19000
Tweets analysed so far: 19500
Tweets analysed so far: 20000
Tweets analysed so far: 20500
Tweets analysed so far: 21000
Tweets analysed so far: 21500
Tweets analysed so far: 22000
Tweets analysed so far: 22500
Tweets analysed so far: 23000
Tweets analysed so far: 23500
Tweets analysed so far: 24000
Tweets analysed so far: 24500
Tweets analysed so far: 25000
Tweets analysed so far: 25500
Tweets analysed so far: 26000
Tweets analysed so far: 26500
Tweets analysed so far: 27000
Tweets analysed so far: 27500
Tweets analysed so far: 28000
Tweets analysed so far: 28500
Tweets analysed so far: 29000
Tweets analysed so far: 29500
Tweets analysed so far: 30000
Tweets analysed so far: 30500
Tweets analysed so far: 31000
Tweets ana

Rate limit exceeded. Sleeping for 743 seconds.


Tweets analysed so far: 45000
Tweets analysed so far: 45500
Tweets analysed so far: 46000
Tweets analysed so far: 46500
Tweets analysed so far: 47000
Tweets analysed so far: 47500
Tweets analysed so far: 48000
Tweets analysed so far: 48500
Tweets analysed so far: 49000
Tweets analysed so far: 49500
Tweets analysed so far: 50000
Tweets analysed so far: 50500
Tweets analysed so far: 51000
Tweets analysed so far: 51500
Tweets analysed so far: 52000
Tweets analysed so far: 52500
Tweets analysed so far: 53000
Tweets analysed so far: 53500
Tweets analysed so far: 54000
Tweets analysed so far: 54500
Tweets analysed so far: 55000
Tweets analysed so far: 55500
Tweets analysed so far: 56000
Tweets analysed so far: 56500
Tweets analysed so far: 57000
Tweets analysed so far: 57500
Tweets analysed so far: 58000
Tweets analysed so far: 58500
Tweets analysed so far: 59000
Tweets analysed so far: 59500
Tweets analysed so far: 60000
Tweets analysed so far: 60500
Tweets analysed so far: 61000
Tweets ana

Rate limit exceeded. Sleeping for 748 seconds.


Tweets analysed so far: 75000
Tweets analysed so far: 75500
Tweets analysed so far: 76000
Tweets analysed so far: 76500
Tweets analysed so far: 77000
Tweets analysed so far: 77500
Tweets analysed so far: 78000
Tweets analysed so far: 78500
Tweets analysed so far: 79000
Tweets analysed so far: 79500
Tweets analysed so far: 80000
Tweets analysed so far: 80500
Tweets analysed so far: 81000
Tweets analysed so far: 81500
Tweets analysed so far: 82000
Tweets analysed so far: 82500
Tweets analysed so far: 83000
Tweets analysed so far: 83500
Tweets analysed so far: 84000
Tweets analysed so far: 84500
Tweets analysed so far: 85000
Tweets analysed so far: 85500
Tweets analysed so far: 86000
Tweets analysed so far: 86500
Tweets analysed so far: 87000
Tweets analysed so far: 87500
Tweets analysed so far: 88000
Tweets analysed so far: 88500
Tweets analysed so far: 89000
Tweets analysed so far: 89500
Tweets analysed so far: 90000
Tweets analysed so far: 90500
Tweets analysed so far: 91000
Tweets ana

Rate limit exceeded. Sleeping for 734 seconds.


Tweets analysed so far: 105000
Tweets analysed so far: 105500
Tweets analysed so far: 106000
Tweets analysed so far: 106500
Tweets analysed so far: 107000
Tweets analysed so far: 107500
Tweets analysed so far: 108000
Tweets analysed so far: 108500
Tweets analysed so far: 109000
Tweets analysed so far: 109500
Tweets analysed so far: 110000
Tweets analysed so far: 110500
Tweets analysed so far: 111000
Tweets analysed so far: 111500
Tweets analysed so far: 112000
Tweets analysed so far: 112500
Tweets analysed so far: 113000
Tweets analysed so far: 113500
Tweets analysed so far: 114000
Tweets analysed so far: 114500
Tweets analysed so far: 115000
Tweets analysed so far: 115500
Tweets analysed so far: 116000
Tweets analysed so far: 116500
Tweets analysed so far: 117000
Tweets analysed so far: 117500
Tweets analysed so far: 118000
Tweets analysed so far: 118500
Tweets analysed so far: 119000
Tweets analysed so far: 119500
Tweets analysed so far: 120000
Tweets analysed so far: 120500
Tweets a

Rate limit exceeded. Sleeping for 735 seconds.


Tweets analysed so far: 135000
Tweets analysed so far: 135500
Tweets analysed so far: 136000
Tweets analysed so far: 136500
Tweets analysed so far: 137000
Tweets analysed so far: 137500
Tweets analysed so far: 138000
Tweets analysed so far: 138500
Tweets analysed so far: 139000
Tweets analysed so far: 139500
Tweets analysed so far: 140000
Tweets analysed so far: 140500
Tweets analysed so far: 141000
Tweets analysed so far: 141500
Tweets analysed so far: 142000
Tweets analysed so far: 142500
Tweets analysed so far: 143000
Tweets analysed so far: 143500
Tweets analysed so far: 144000
Tweets analysed so far: 144500
Tweets analysed so far: 145000
Tweets analysed so far: 145500
Tweets analysed so far: 146000
Tweets analysed so far: 146500
Tweets analysed so far: 147000
Tweets analysed so far: 147500
Tweets analysed so far: 148000
Tweets analysed so far: 148500
Tweets analysed so far: 149000
Tweets analysed so far: 149500
Tweets analysed so far: 150000
Tweets analysed so far: 150500
Tweets a

Rate limit exceeded. Sleeping for 751 seconds.


ConnectionError: HTTPSConnectionPool(host='api.twitter.com', port=443): Max retries exceeded with url: /2/tweets?tweet.fields=created_at%2Ctext%2Clang%2Cauthor_id%2Csource&user.fields=location%2Cdescription%2Cverified&expansions=author_id&ids=707804827254923265%2C707805068423200768%2C707805250481233920%2C707805464122298368%2C707805771656986626%2C707805980571074561%2C707805981573517315%2C707806134565015552%2C707806165359529984%2C707806353650262017%2C707807161557098496%2C707807306248024064%2C707807918951899136%2C707808082613706758%2C707808285467013123%2C707808320845914112%2C707808374423994369%2C707808478560124930%2C707808573489815553%2C707809011316428800%2C707809164651790336%2C707809269882732545%2C707809279487647744%2C707809290501885953%2C707809369321316352%2C707809510807769094%2C707809588943429632%2C707809612964167680%2C707809926295523328%2C707811015992479744%2C707811076931522560%2C707811248805703680%2C707811254077882368%2C707811261640261632%2C707811583502712832%2C707811929092444163%2C707812148571979780%2C707812428357214208%2C707812467771109376%2C707812499312287744%2C707812515573571585%2C707813093821251584%2C707813334624686080%2C707813537251520512%2C707813647511396352%2C707813787303350272%2C707814012994588672%2C707817066292781056%2C707824067978645504%2C707833417862230016%2C707833894347927553%2C707835114219122688%2C707835161656868865%2C707839237257691136%2C707839355554013184%2C707850771568599041%2C707851141535690754%2C707852281555714048%2C707864603934400512%2C707865133616201728%2C707865405566656512%2C707866115502792704%2C707883216133169152%2C707888638076407808%2C707889849542389760%2C707892498845323264%2C707902712315305984%2C707916164630974465%2C707918510643609600%2C707923747911524352%2C707924350461026304%2C707939611926994944%2C707955460905836544%2C707967932983173120%2C708000238624251906%2C708000811121442816%2C708004244243107840%2C708015595439202304%2C708080940019486720%2C708081088854274053%2C708150377460269056%2C708150432925917184%2C708388034224324608%2C708400546990977024%2C708735070933274624%2C708970660786208768%2C720787591793098752%2C720824767754801153%2C720840934699638785%2C720841029478256640%2C720841298190606336%2C720841363344855040%2C720841437437235200%2C720841446773751808%2C720841501555560448%2C720841523798016000%2C720841952841740288%2C720842244354220032%2C720842377884094464%2C720842517378301953 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001E88950EAF0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [33]:
def remove_date(row):
    return row['Created at'].timetz()

tweets_df['Created at'] = tweets_df.apply(lambda row: remove_date(row), axis=1)

tweets_df.head()

Unnamed: 0,ID,Text,Created at,User location,Num Followers,Num Friends,Num Favourites,User description,User verified,Language
0,937349434668498944,BREAKING: First NFL Team Declares Bankruptcy O...,15:54:54+00:00,"Sugar Land, TX",1712,2751,22141,Ofelia. Arizmendez @ deplorable me. I am a pro...,False,en
1,937379378006282240,BREAKING: First NFL Team Declares Bankruptcy O...,17:53:54+00:00,,14,99,125,,False,en
2,937380068590055425,BREAKING: First NFL Team Declares Bankruptcy O...,17:56:38+00:00,,14,99,125,,False,en
3,937429898670600192,BREAKING: First NFL Team Declares Bankruptcy O...,21:14:39+00:00,Ozarks. Missouri,1162,1258,9422,Happily married conservative Pentecostal woman...,False,en
4,937449906352152576,BREAKING: First NFL Team Declares Bankruptcy O...,22:34:09+00:00,,12772,13708,23094,Deplorable member of The Silenced Majority.. #...,False,en


In [16]:
tweets_df2.to_csv("final_short.csv")

hello
