In [1]:
import pandas as pd
import numpy as np

In [2]:
import spacy
import torch

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
#!pip install spacymoji
from spacymoji import Emoji

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

#!pip install transformers
from transformers import AutoModel, BertTokenizerFast

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

## Fetching Tweet data from Twitter

In [None]:
politifact_fake_df = pd.read_csv("data/FakeNewsNet/dataset/politifact_fake.csv")

pf_fake_tweet_ids = politifact_fake_df['tweet_ids'].str.split('\t').explode().tolist()
print(len(pf_fake_tweet_ids))
print()

politifact_real_df = pd.read_csv("data/FakeNewsNet/dataset/politifact_real.csv")
pf_real_tweet_ids = politifact_real_df['tweet_ids'].str.split('\t').explode().tolist()
print(len(pf_real_tweet_ids))

fake_short = pf_fake_tweet_ids[:20]

print(fake_short)

In [None]:
api_key = os.environ.get('APIKey')
api_key_secret = os.environ.get('APIKeySecret')
access_token = os.environ.get('AccessToken')
access_token_secret = os.environ.get('AccessTokenSecret')
bearer_token = os.environ.get('BearerToken')

In [None]:
# Remove nans (don't kill grandmas though)
# Fake
print(len(pf_fake_tweet_ids))
print(np.nan in pf_fake_tweet_ids)
pf_fake_tweet_ids = [x for x in pf_fake_tweet_ids if x is not np.nan]
print(len(pf_fake_tweet_ids))
print(np.nan in pf_fake_tweet_ids)

# Real
print(len(pf_real_tweet_ids))
print(np.nan in pf_real_tweet_ids)
pf_real_tweet_ids = [x for x in pf_real_tweet_ids if x is not np.nan]
print(len(pf_real_tweet_ids))
print(np.nan in pf_real_tweet_ids)

In [None]:
classes = [pf_fake_tweet_ids, pf_real_tweet_ids]
for news_class in classes:
    IDs = news_class
    n = len(IDs)
    for batch_ix in range(0, n, 100):
        if batch_ix % 500 == 0:
            print("Tweets analysed so far:", batch_ix)

        response = client.get_tweets(IDs[batch_ix : min(batch_ix + 100, n)], 
                                     tweet_fields=
                                         ["created_at", "text", "lang", "author_id", "source"],
                                     user_fields=
                                         ["location", "description", "verified"],
                                     expansions="author_id")

        # https://www.kirenz.com/post/2021-12-10-twitter-api-v2-tweepy-and-pandas-in-python/twitter-api-v2-tweepy-and-pandas-in-python/
        # Save data as dictionary
        tweets_dict = response.json() 

        # Extract "data" value from dictionary
        try:
            tweets_data = tweets_dict['data'] # can fail if nans in Tweets
            user_data = tweets_dict['includes']['users']

            # Transform to pandas Dataframe
            df_tweets = pd.json_normalize(tweets_data)
            df_tweets.head()

            # Transform to pandas Dataframe
            df_users = pd.json_normalize(user_data)
            df_users = df_users.rename(columns={'id': 'author_id'})
            df_users.head()

            df = pd.merge(df_tweets, df_users, on=["author_id"])

            # Standardise order of columns
            df = df[['id', 'text', 'author_id', 'source', 'created_at', 'edit_history_tweet_ids', 'lang', 'description', 'username', 'verified', 'name', 'location']]

            include_header = batch_ix == 0
            df.to_csv("real.csv", mode='a', header=include_header)
        except:
            continue

## Connecting original and later-fetched real news

In [4]:
og = pd.read_csv("data/FakeNewsNet/real.csv")
later = pd.read_csv("real.csv")

# frames = [og, later]
# new = pd.concat(frames)

# new.to_csv("real_new.csv")

new = pd.read_csv("real_new.csv")

117009
179162
296171


In [6]:
print(len(og))
print(len(later))
print(len(new))
print(len(og) + len(later) == len(new))

117009
179162
296171
True


In [46]:
fake = pd.read_csv("fake_with_article.csv")
print(len(fake))

108931


## Add news article ID to dataset

In [44]:
og_real = pd.read_csv("data/FakeNewsNet/dataset/politifact_real.csv")
real = pd.read_csv("data/FakeNewsNet/real.csv", dtype={"id": str})

og_fake = pd.read_csv("data/FakeNewsNet/dataset/politifact_fake.csv")
fake = pd.read_csv("data/FakeNewsNet/fake.csv", dtype={"id": str})

fake['article_id'] = ""

for row_ix, row in og_fake.iterrows():
    if row_ix % 50 == 0:
            print(f"Got through {row_ix} rows!")
    print(row['id'])
    if row['tweet_ids'] != "" and row['tweet_ids'] is not np.nan:
        tweet_IDs = row['tweet_ids'].split("\t")
        for tweet_ID in tweet_IDs:
            fake.loc[fake['id'] == tweet_ID, 'article_id'] = row['id']
            
fake.to_csv("fake_with_article.csv")
fake.head()

Got through 0 rows!
politifact15014
politifact15156
politifact14745
politifact14355
politifact15371
politifact14404
politifact13919
politifact14795
politifact14328
politifact13775
politifact14678
politifact14394
politifact14376
politifact14233
politifact14890
politifact14356
politifact14776
politifact15355
politifact14664
politifact15178
politifact15052
politifact15309
politifact15267
politifact13836
politifact15553
politifact14755
politifact13806
politifact15164
politifact14388
politifact14742
politifact14544
politifact13677
politifact15280
politifact14310
politifact15130
politifact14694
politifact14863
politifact15004
politifact13773
politifact14258
politifact14235
politifact15251
politifact14940
politifact13687
politifact14305
politifact15367
politifact14862
politifact13949
politifact15591
politifact13887
Got through 50 rows!
politifact15262
politifact15525
politifact13589
politifact14005
politifact15271
politifact13565
politifact15354
politifact14426
politifact14565
politifact14548

Unnamed: 0.1,Unnamed: 0,id,text,author_id,source,created_at,edit_history_tweet_ids,lang,description,username,verified,name,location,article_id
0,0,937349434668498944,BREAKING: First NFL Team Declares Bankruptcy O...,4219197432,Twitter Web Client,2017-12-03T15:54:54.000Z,['937349434668498944'],en,Ofelia. Arizmendez @ deplorable me. I am a pro...,OfeliasHeaven,False,Ofelia Duchess Arizmendez,"Sugar Land, TX",politifact15014
1,1,937379378006282240,BREAKING: First NFL Team Declares Bankruptcy O...,3018973429,Facebook,2017-12-03T17:53:54.000Z,['937379378006282240'],en,,lorn_cramer,False,Lorn Cramer,,politifact15014
2,2,937380068590055425,BREAKING: First NFL Team Declares Bankruptcy O...,3018973429,Facebook,2017-12-03T17:56:38.000Z,['937380068590055425'],en,,lorn_cramer,False,Lorn Cramer,,politifact15014
3,3,937429898670600192,BREAKING: First NFL Team Declares Bankruptcy O...,23162382,Twitter for iPhone,2017-12-03T21:14:39.000Z,['937429898670600192'],en,Happily married conservative Pentecostal woman...,starchaser57,False,🌹Star Chaser 🌼🇺🇸,Ozarks. Missouri,politifact15014
4,4,937449906352152576,BREAKING: First NFL Team Declares Bankruptcy O...,1409084934,Twitter Web Client,2017-12-03T22:34:09.000Z,['937449906352152576'],en,Deplorable member of The Silenced Majority.. #...,ThePipeStore,False,Mr. Walker 🇺🇸,,politifact15014


In [38]:
real['article_id'].value_counts()
#real['id'].value_counts()

politifact12104    19749
politifact10371    13720
politifact1201     12618
politifact8005     11755
politifact13469    11049
                   ...  
politifact11777        1
politifact463          1
politifact1038         1
politifact495          1
politifact7258         1
Name: article_id, Length: 395, dtype: int64

In [42]:
real["article_id"].nunique()

395

In [45]:
fake["article_id"].nunique()

384

## Split dataset by groups

In [49]:
real = pd.read_csv("real_with_article.csv")
real.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,text,author_id,source,created_at,edit_history_tweet_ids,lang,description,username,verified,name,location,article_id
0,0,0,0,967132259869487105,"""According to a new survey by the National Fed...",868603454,Hootsuite,2018-02-23T20:21:13.000Z,['967132259869487105'],en,I have been advocating the interests of small ...,OwensNFIB,False,Charles Owens,,politifact14984
1,1,1,1,969607746898153473,A record number of small business owners are s...,868603454,Hootsuite,2018-03-02T16:17:55.000Z,['969607746898153473'],en,I have been advocating the interests of small ...,OwensNFIB,False,Charles Owens,,politifact14984
2,2,2,2,971815809260707841,“A record number of small business owners said...,868603454,Hootsuite,2018-03-08T18:31:59.000Z,['971815809260707841'],en,I have been advocating the interests of small ...,OwensNFIB,False,Charles Owens,,politifact14984
3,3,3,3,974309878863888389,"“Yesterday, the National Federation of Indepen...",868603454,Hootsuite,2018-03-15T15:42:31.000Z,['974309878863888389'],en,I have been advocating the interests of small ...,OwensNFIB,False,Charles Owens,,politifact14984
4,4,4,4,976498012246740992,“We have seen a real turnaround in the outlook...,868603454,Hootsuite,2018-03-21T16:37:23.000Z,['976498012246740992'],en,I have been advocating the interests of small ...,OwensNFIB,False,Charles Owens,,politifact14984


In [51]:
# Experimentation - not needed
prev_article = ""
article_counter = 0
for row_ix, row in real.iterrows():
    if row['article_id'] != prev_article:
        print(f"New article at row index {row_ix}!")
        article_counter += 1
        print("Articles so far:", article_counter)
        prev_article = row['article_id']

New article at row index 0!
Articles so far: 1
New article at row index 226!
Articles so far: 2
New article at row index 227!
Articles so far: 3
New article at row index 880!
Articles so far: 4
New article at row index 928!
Articles so far: 5
New article at row index 931!
Articles so far: 6
New article at row index 933!
Articles so far: 7
New article at row index 934!
Articles so far: 8
New article at row index 935!
Articles so far: 9
New article at row index 959!
Articles so far: 10
New article at row index 1036!
Articles so far: 11
New article at row index 2767!
Articles so far: 12
New article at row index 3035!
Articles so far: 13
New article at row index 3081!
Articles so far: 14
New article at row index 3094!
Articles so far: 15
New article at row index 3147!
Articles so far: 16
New article at row index 3194!
Articles so far: 17
New article at row index 4452!
Articles so far: 18
New article at row index 4454!
Articles so far: 19
New article at row index 4484!
Articles so far: 20
N

New article at row index 9826!
Articles so far: 208
New article at row index 9830!
Articles so far: 209
New article at row index 11434!
Articles so far: 210
New article at row index 12393!
Articles so far: 211
New article at row index 14311!
Articles so far: 212
New article at row index 14313!
Articles so far: 213
New article at row index 15074!
Articles so far: 214
New article at row index 15075!
Articles so far: 215
New article at row index 15258!
Articles so far: 216
New article at row index 15259!
Articles so far: 217
New article at row index 15306!
Articles so far: 218
New article at row index 15343!
Articles so far: 219
New article at row index 15742!
Articles so far: 220
New article at row index 15743!
Articles so far: 221
New article at row index 15758!
Articles so far: 222
New article at row index 15760!
Articles so far: 223
New article at row index 15778!
Articles so far: 224
New article at row index 15781!
Articles so far: 225
New article at row index 15786!
Articles so far:

New article at row index 26334!
Articles so far: 405
New article at row index 26335!
Articles so far: 406
New article at row index 26338!
Articles so far: 407
New article at row index 26339!
Articles so far: 408
New article at row index 26341!
Articles so far: 409
New article at row index 26342!
Articles so far: 410
New article at row index 26399!
Articles so far: 411
New article at row index 26400!
Articles so far: 412
New article at row index 26414!
Articles so far: 413
New article at row index 26415!
Articles so far: 414
New article at row index 26515!
Articles so far: 415
New article at row index 26516!
Articles so far: 416
New article at row index 26518!
Articles so far: 417
New article at row index 26519!
Articles so far: 418
New article at row index 26525!
Articles so far: 419
New article at row index 26526!
Articles so far: 420
New article at row index 26527!
Articles so far: 421
New article at row index 26528!
Articles so far: 422
New article at row index 26535!
Articles so fa

New article at row index 38027!
Articles so far: 592
New article at row index 38028!
Articles so far: 593
New article at row index 40303!
Articles so far: 594
New article at row index 41671!
Articles so far: 595
New article at row index 42856!
Articles so far: 596
New article at row index 42857!
Articles so far: 597
New article at row index 42904!
Articles so far: 598
New article at row index 42905!
Articles so far: 599
New article at row index 42975!
Articles so far: 600
New article at row index 42976!
Articles so far: 601
New article at row index 43289!
Articles so far: 602
New article at row index 43290!
Articles so far: 603
New article at row index 43357!
Articles so far: 604
New article at row index 43549!
Articles so far: 605
New article at row index 44103!
Articles so far: 606
New article at row index 44105!
Articles so far: 607
New article at row index 44111!
Articles so far: 608
New article at row index 45101!
Articles so far: 609
New article at row index 45854!
Articles so fa

New article at row index 76137!
Articles so far: 769
New article at row index 76138!
Articles so far: 770
New article at row index 76552!
Articles so far: 771
New article at row index 76553!
Articles so far: 772
New article at row index 76596!
Articles so far: 773
New article at row index 76601!
Articles so far: 774
New article at row index 76604!
Articles so far: 775
New article at row index 76609!
Articles so far: 776
New article at row index 76610!
Articles so far: 777
New article at row index 76611!
Articles so far: 778
New article at row index 76622!
Articles so far: 779
New article at row index 76624!
Articles so far: 780
New article at row index 76626!
Articles so far: 781
New article at row index 76629!
Articles so far: 782
New article at row index 76772!
Articles so far: 783
New article at row index 76775!
Articles so far: 784
New article at row index 76776!
Articles so far: 785
New article at row index 76781!
Articles so far: 786
New article at row index 76933!
Articles so fa

New article at row index 85707!
Articles so far: 981
New article at row index 85720!
Articles so far: 982
New article at row index 85729!
Articles so far: 983
New article at row index 85953!
Articles so far: 984
New article at row index 85954!
Articles so far: 985
New article at row index 86177!
Articles so far: 986
New article at row index 86178!
Articles so far: 987
New article at row index 86305!
Articles so far: 988
New article at row index 86306!
Articles so far: 989
New article at row index 86386!
Articles so far: 990
New article at row index 86444!
Articles so far: 991
New article at row index 86445!
Articles so far: 992
New article at row index 86810!
Articles so far: 993
New article at row index 86932!
Articles so far: 994
New article at row index 87911!
Articles so far: 995
New article at row index 91456!
Articles so far: 996
New article at row index 91457!
Articles so far: 997
New article at row index 91626!
Articles so far: 998
New article at row index 91627!
Articles so fa

New article at row index 116335!
Articles so far: 1221
New article at row index 116346!
Articles so far: 1222
New article at row index 116356!
Articles so far: 1223
New article at row index 116357!
Articles so far: 1224
New article at row index 116370!
Articles so far: 1225
New article at row index 116409!
Articles so far: 1226
New article at row index 116452!
Articles so far: 1227
New article at row index 116453!
Articles so far: 1228
New article at row index 116461!
Articles so far: 1229
New article at row index 116462!
Articles so far: 1230
New article at row index 116508!
Articles so far: 1231
New article at row index 116509!
Articles so far: 1232
New article at row index 116512!
Articles so far: 1233
New article at row index 116513!
Articles so far: 1234
New article at row index 116518!
Articles so far: 1235
New article at row index 116519!
Articles so far: 1236
New article at row index 116525!
Articles so far: 1237
New article at row index 116526!
Articles so far: 1238
New articl

New article at row index 154835!
Articles so far: 1398
New article at row index 154836!
Articles so far: 1399
New article at row index 154868!
Articles so far: 1400
New article at row index 154884!
Articles so far: 1401
New article at row index 154906!
Articles so far: 1402
New article at row index 154935!
Articles so far: 1403
New article at row index 155328!
Articles so far: 1404
New article at row index 155336!
Articles so far: 1405
New article at row index 163993!
Articles so far: 1406
New article at row index 163994!
Articles so far: 1407
New article at row index 164117!
Articles so far: 1408
New article at row index 164118!
Articles so far: 1409
New article at row index 164274!
Articles so far: 1410
New article at row index 164275!
Articles so far: 1411
New article at row index 164379!
Articles so far: 1412
New article at row index 164380!
Articles so far: 1413
New article at row index 164477!
Articles so far: 1414
New article at row index 164481!
Articles so far: 1415
New articl

New article at row index 172518!
Articles so far: 1634
New article at row index 172539!
Articles so far: 1635
New article at row index 172540!
Articles so far: 1636
New article at row index 173534!
Articles so far: 1637
New article at row index 174388!
Articles so far: 1638
New article at row index 174391!
Articles so far: 1639
New article at row index 174393!
Articles so far: 1640
New article at row index 174426!
Articles so far: 1641
New article at row index 175246!
Articles so far: 1642
New article at row index 180481!
Articles so far: 1643
New article at row index 180482!
Articles so far: 1644
New article at row index 184881!
Articles so far: 1645
New article at row index 184882!
Articles so far: 1646
New article at row index 184956!
Articles so far: 1647
New article at row index 184957!
Articles so far: 1648
New article at row index 185172!
Articles so far: 1649
New article at row index 185173!
Articles so far: 1650
New article at row index 185575!
Articles so far: 1651
New articl

New article at row index 240185!
Articles so far: 1810
New article at row index 240344!
Articles so far: 1811
New article at row index 240974!
Articles so far: 1812
New article at row index 240989!
Articles so far: 1813
New article at row index 240991!
Articles so far: 1814
New article at row index 240997!
Articles so far: 1815
New article at row index 241439!
Articles so far: 1816
New article at row index 241449!
Articles so far: 1817
New article at row index 242331!
Articles so far: 1818
New article at row index 242332!
Articles so far: 1819
New article at row index 242333!
Articles so far: 1820
New article at row index 242365!
Articles so far: 1821
New article at row index 242640!
Articles so far: 1822
New article at row index 246552!
Articles so far: 1823
New article at row index 246553!
Articles so far: 1824
New article at row index 249082!
Articles so far: 1825
New article at row index 249888!
Articles so far: 1826
New article at row index 250015!
Articles so far: 1827
New articl

In [23]:
# Combine dataset classes
real_df = pd.read_csv("real_with_article.csv")
fake_df = pd.read_csv("fake_with_article.csv")
fake_df['true'] = False
real_df['true'] = True
df = pd.concat([fake_df, real_df])

In [24]:
print(len(df[df['true'] == True]))
print(len(df[df['true'] == False]))

296171
108931


In [25]:
# Fix imbalanced data
# https://elitedatascience.com/imbalanced-classes

from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df.true==True]
df_minority = df[df.true==False]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=len(df_minority),     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_downsampled.true.value_counts()

True     108931
False    108931
Name: true, dtype: int64

In [41]:
#https://stackoverflow.com/questions/61337373/split-on-train-and-test-separating-by-group
#from sklearn.model_selection import GroupShuffleSplit

# Update df to the downsampled one

X = df_downsampled.drop(labels='true', axis=1)
y = df_downsampled.true


# Split in training/testing data but in a way that Tweets referring to the same article don't appear in both datasets
gs = GroupShuffleSplit(n_splits=2, test_size=.6, random_state=0)
train_ix, test_ix = next(gs.split(X, y, groups=X.article_id))

print(type(train_ix))

X_train = X.iloc[train_ix]
y_train = y.iloc[train_ix]

X_test = X.iloc[test_ix]
y_test = y.iloc[test_ix]

X_train.head()

<class 'numpy.ndarray'>


Unnamed: 0.3,Unnamed: 0.1,Unnamed: 0,id,text,author_id,source,created_at,edit_history_tweet_ids,lang,description,username,verified,name,location,article_id,Unnamed: 0.2
204478,87469,59,1005223100793753600,Responsibilities require you to be one hundred...,923926573,Twittascope,2018-06-08T23:00:38.000Z,['1005223100793753600'],en,She got an ass that could swallow up a g-string,amy1thomas,False,lembit opik,,politifact11399,204478.0
210630,93621,39,1055770089222537216,#octinbooks18 Day Fourteen: Hogwarts Class You...,2661915614,Instagram,2018-10-26T10:36:19.000Z,['1055770089222537216'],en,,darkflamereads,False,Chloe,Australia,politifact11399,210630.0
96763,96763,35,1039820996239286272,Read the full transcript of Obama’s fiery anti...,62533108,Twitter Web Client,2018-09-12T10:20:19.000Z,['1039820996239286272'],en,I may be small but I bite.,wellduuhn,False,Dan,Feelpeens,politifact512,96763.0
258305,141296,40,1071394994592907264,"In 2004, a one-term senator from Illinois took...",147689176,Facebook,2018-12-08T13:24:06.000Z,['1071394994592907264'],en,Dawgen Global is an integrated multidisciplina...,dawgenja,False,Dawgen Global,Caribbean Head Office -Jamaica,politifact671,258305.0
95357,95357,47,1022953962423701504,@steve_durnan @ThomasSowell Sure did and I not...,898272197407584257,Twitter Web Client,2018-07-27T21:16:44.000Z,['1022953962423701504'],en,"Climate, war, and inequality are linked at the...",Tav_assoli,False,Climate Clock,Spaceship Earth,politifact923,95357.0


In [42]:
X_train["article_id"].value_counts()

politifact13949    4911
politifact14742    3548
politifact11399    3357
politifact954      3109
politifact671      3038
                   ... 
politifact429         1
politifact2298        1
politifact740         1
politifact1038        1
politifact1277        1
Name: article_id, Length: 294, dtype: int64

In [33]:
X_test["article_id"].value_counts()

politifact14548    14775
politifact14667     6999
politifact12104     5702
politifact10371     3617
politifact13949     3502
                   ...  
politifact429          1
politifact2298         1
politifact740          1
politifact773          1
politifact15327        1
Name: article_id, Length: 615, dtype: int64

In [43]:
# Quick ML
# Get features for TF-IDF

train_text = X_train['text']
train_labels = y_train

test_text = X_test['text']
test_labels = y_test

tfidf = TfidfVectorizer()#tokenizer=spacy_tokeniser)
tfidf.fit(train_text)
train_features = tfidf.transform(train_text)
#validation_features = tfidf.transform(val_text)
test_features = tfidf.transform(test_text)

def evaluation_summary(description, true_labels, predictions):
  print("Evaluation for: " + description)
  print(classification_report(true_labels, predictions,  digits=3, zero_division=0))
  print('\nConfusion matrix:\n',confusion_matrix(true_labels, predictions))

# SVC
svc = SVC(kernel='rbf')
svc_model = svc.fit(train_features, train_labels)
# print(svc_model.score(validation_features, val_labels))
# svc_predicted_labels = svc_model.predict(validation_features)

#evaluation_summary("SVC", val_labels, svc_predicted_labels)
svc_test = svc_model.predict(test_features)
evaluation_summary("SVC test", test_labels, svc_test)

# Logistic Regression with TF-IDF
lr_tfidf = LogisticRegression()
lr_model_tfidf = lr_tfidf.fit(train_features, train_labels)
# print(lr_model_tfidf.score(validation_features, val_labels))
print(lr_model_tfidf.score(test_features, test_labels))

# lr_predicted_labels_tfidf = lr_model_tfidf.predict(validation_features)
# evaluation_summary("LR (TF-IDF)", val_labels, lr_predicted_labels_tfidf)
lrtfidf_test = lr_model_tfidf.predict(test_features)
evaluation_summary("LR (TF-IDF) test", test_labels, lrtfidf_test)

# Dummy Majority
dumb = DummyClassifier(strategy='most_frequent')
dumb.fit(train_features, train_labels)
# print(dumb.score(validation_features, val_labels))
# dumb_validation_predicted_labels = dumb.predict(validation_features)
# evaluation_summary("Dummy majority", val_labels, dumb_validation_predicted_labels)
dumb_test = dumb.predict(test_features)
evaluation_summary("Dummy MF test", test_labels, dumb_test)

Evaluation for: SVC test
              precision    recall  f1-score   support

       False      0.816     0.285     0.422     74981
        True      0.572     0.937     0.711     76622

    accuracy                          0.614    151603
   macro avg      0.694     0.611     0.566    151603
weighted avg      0.693     0.614     0.568    151603


Confusion matrix:
 [[21340 53641]
 [ 4824 71798]]
0.6801844290680263
Evaluation for: LR (TF-IDF) test
              precision    recall  f1-score   support

       False      0.823     0.450     0.582     74981
        True      0.627     0.905     0.741     76622

    accuracy                          0.680    151603
   macro avg      0.725     0.678     0.662    151603
weighted avg      0.724     0.680     0.662    151603


Confusion matrix:
 [[33752 41229]
 [ 7256 69366]]
Evaluation for: Dummy MF test
              precision    recall  f1-score   support

       False      0.495     1.000     0.662     74981
        True      0.000     

In [71]:
# Dummy Majority
dumb = DummyClassifier(strategy='most_frequent')
dumb.fit(train_features, train_labels)
# print(dumb.score(validation_features, val_labels))
# dumb_validation_predicted_labels = dumb.predict(validation_features)
# evaluation_summary("Dummy majority", val_labels, dumb_validation_predicted_labels)
dumb_test = dumb.predict(test_features)
evaluation_summary("Dummy MF test", test_labels, dumb_test)

Evaluation for: Dummy MF test
              precision    recall  f1-score   support

       False      0.000     0.000     0.000     82532
        True      0.683     1.000     0.811    177584

    accuracy                          0.683    260116
   macro avg      0.341     0.500     0.406    260116
weighted avg      0.466     0.683     0.554    260116


Confusion matrix:
 [[     0  82532]
 [     0 177584]]
