# Import All Packages




In [1]:
    import random
    import pandas as pd
    import numpy as np
    import string
    from string import digits
    from sklearn.model_selection import train_test_split
    import torch
    import torch.nn as nn
    from sklearn.metrics import classification_report
    import transformers
    from transformers import AutoModel, BertTokenizerFast
    from ipywidgets import IntProgress
    from tqdm import tqdm



# Import BERT Model, BERT Tokenizer and Torch

In [2]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# import Torch
device = torch.device("cpu")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Predefined CLass

In [3]:
class Propaganda:
    NEGATIVE = 0
    POSITIVE = 1

class Review:
    def __init__(self,sentence,SUBJprop):
        self.sentence = sentence
        self.SUBJprop = SUBJprop
        self.propaganda = SUBJprop


class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews

    def get_sentence(self):
        return [x.sentence for x  in self.reviews]

    def get_propaganda(self):
        return [int(x.propaganda) for x in self.reviews]

    def evenly_distribute(self):
        negative = list(filter(lambda x: x.propaganda == str(Propaganda.NEGATIVE), self.reviews))
        positive = list(filter(lambda x: x.propaganda == str(Propaganda.POSITIVE), self.reviews))
        negative_shrunk = negative[:len(positive)]
        self.reviews = positive + negative_shrunk
        random.shuffle(self.reviews)

# Split train dataset into train, validation and test sets

In [4]:
# step 2.1: Load Data
reviews = []
data  = pd.read_excel('Data/finalDataset.xlsx', engine='openpyxl')
df = pd.DataFrame(data.astype(str) , columns = ['Sentence','SUBJprop'])
# iterate elements of attribute "Sentence" and "SUBJprop" and push to the array "reviews"

for index, row in df.iterrows():
    sentence = row['Sentence']
    prop = row['SUBJprop']
    reviews.append(Review(sentence,prop))

print("Total Rows:")
print(len(reviews))
print("Total Positive:")
print(len(list(filter(lambda x: x.propaganda == str(Propaganda.POSITIVE), reviews))))
print("Total Negative:")
print(len(list(filter(lambda x: x.propaganda == str(Propaganda.NEGATIVE), reviews))))

Total Rows:
14058
Total Positive:
3904
Total Negative:
10154


## Split dataset

In [5]:
neg_prop = list(filter(lambda x: x.propaganda == str(Propaganda.NEGATIVE), reviews))
pos_prop = list(filter(lambda x: x.propaganda == str(Propaganda.POSITIVE), reviews))
########################################################################################
#split trainig and DevTest dataset
neg_train, neg_devtest  = train_test_split(neg_prop , train_size=0.7, shuffle= False )
pos_train, pos_devtest = train_test_split(pos_prop , train_size=0.7, shuffle= False )
########################################################################################
#prepare training dataset
train = neg_train + pos_train
#random.shuffle(train)
########################################################################################
#prepare development and test dataset
neg_dev, neg_test = train_test_split(neg_devtest , train_size=0.5, shuffle= False )
pos_dev, pos_test = train_test_split(pos_devtest , train_size=0.5, shuffle= False )

dev = neg_dev + pos_dev
#random.shuffle(dev)

test = neg_test + pos_test
#random.shuffle(test)

In [6]:
# step 3: Seperate the attribute, originally our array has text and score. we want them to be a seperate array
train_container = ReviewContainer(train)
train_container.evenly_distribute()

train_text = train_container.get_sentence()   
train_labels = train_container.get_propaganda() 

dev_text = [x.sentence for x in dev]
dev_labels = [int(x.propaganda) for x in dev]

test_text = [x.sentence for x in test]
test_labels = [int(x.propaganda) for x in test]



### Print Sample Output

In [7]:
print('Total Train Records:')
print(len(train_text))
print('Negative Train Records:')
print(train_labels.count(0))
print('Positive Train Records:')
print(train_labels.count(1))
print('\n')
print('Total Dev Records:')
print(len(dev_text))
print('Negative Dev Records:')
print(dev_labels.count(0))
print('Positive Dev Records:')
print(dev_labels.count(1))
print('\n')
print('Total Test Records:')
print(len(test_text))
print('Negative Test Records:')
print(test_labels.count(0))
print('Positive Test Records:')
print(test_labels.count(1))

Total Train Records:
5464
Negative Train Records:
2732
Positive Train Records:
2732


Total Dev Records:
2109
Negative Dev Records:
1523
Positive Dev Records:
586


Total Test Records:
2110
Negative Test Records:
1524
Positive Test Records:
586


## Save splited Data to seprate excel files

In [8]:
# Create Excel file of train dataset
train_df = pd.DataFrame(train_text, columns=["Sentence"])
train_df['SUBJprop'] = train_labels
train_df.to_excel("./Data/trainDataset.xlsx")
# Create Excel file of dev dataset
dev_df = pd.DataFrame(dev_text, columns=["Sentence"])
dev_df['SUBJprop'] = dev_labels
dev_df['Tanbih'] = ""
dev_df.to_excel("./Data/devDataset.xlsx")
# Create Excel file of test dataset
test_df = pd.DataFrame(test_text, columns=["Sentence"])
test_df['SUBJprop'] = test_labels
test_df.to_excel("./Data/testDataset.xlsx")

# Tokenization and Filtering Punctuation

In [9]:
from nltk.tokenize import RegexpTokenizer
tokenizerNLTK = RegexpTokenizer(r'\w+')

train_text_tokenized = []
dev_text_tokenized = []
test_text_tokenized = []


for i in range(len(train_text)):
    train_text_tokenized.append(tokenizerNLTK.tokenize(train_text[i])) 

for i in range(len(dev_text)):
    dev_text_tokenized.append(tokenizerNLTK.tokenize(dev_text[i])) 

for i in range(len(test_text)):
    test_text_tokenized.append(tokenizerNLTK.tokenize(test_text[i]))

### Print Sample Output

In [11]:
print("Before Sentence Tokenization:")
print(train_text[0:3])
print("After Sentence Tokenization:")
print(train_text_tokenized[0:3])

Before Sentence Tokenization:
['One way or another, the Church will have to repel an attacker at her very summit.\n', 'As a firearms expert, he also questioned the lack of flashes coming from the hotel windows where the shooter was supposedly firing from.\n', 'The investigation has been conducted by local and state police, not the feds, and yet, they continue to claim there is not Islamic terrorist involvement.\n']
After Sentence Tokenization:
[['One', 'way', 'or', 'another', 'the', 'Church', 'will', 'have', 'to', 'repel', 'an', 'attacker', 'at', 'her', 'very', 'summit'], ['As', 'a', 'firearms', 'expert', 'he', 'also', 'questioned', 'the', 'lack', 'of', 'flashes', 'coming', 'from', 'the', 'hotel', 'windows', 'where', 'the', 'shooter', 'was', 'supposedly', 'firing', 'from'], ['The', 'investigation', 'has', 'been', 'conducted', 'by', 'local', 'and', 'state', 'police', 'not', 'the', 'feds', 'and', 'yet', 'they', 'continue', 'to', 'claim', 'there', 'is', 'not', 'Islamic', 'terrorist', 'inv

# Repetition

In [None]:
# train_text2=[]
# SentimentWords

# def checkIfDuplicates(listOfElems):
#     if len(listOfElems) == len(set(listOfElems)):
#         return False
#     else:
#         return True

# for sentence in train_text1:
#     temp=[]
#     if checkIfDuplicates(sentence):
#         print(sentence)



    # for word in sentence:
    #     temp.append(word)

    # train_text2.append(temp)

# print(train_text2[:10])

# Import Dictionaries

In [12]:
# step 2.2: Load Sentimental Data
SentimentWords= []
SentimentValue= []


Sentimentdata  = pd.read_excel('Data/DictionaryWords.xlsx', engine='openpyxl')
df = pd.DataFrame(Sentimentdata.astype(str) , columns = ['word','value'])

for index, row in df.iterrows():
    word = row['word']
    value = row['value']
    SentimentWords.append(word)
    SentimentValue.append(value)

# Removing stop words

In [13]:
import nltk
from nltk.corpus import stopwords
set(stopwords.words('english'))
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()


# Create list of word tokens after removing stopwords in train_text
train_text1=[]
for sentence in train_text_tokenized:
    temp=[]
    for word in sentence:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            temp.append(word) 
    train_text1.append(temp)

# print(train_text1[:100])
# print("*****************************************************")

# Create list of word tokens after removing stopwords in dev_text
dev_text1 =[]

for sentence in dev_text_tokenized:
    temp=[]
    for word in sentence:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            temp.append(word) 
    dev_text1.append(temp)
    
# print(dev_text1[:100])
# print("*****************************************************")

# # Create list of word tokens after removing stopwords in test_text
test_text1 =[]

for sentence in test_text_tokenized:
    temp=[]
    for word in sentence:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            temp.append(word)
    test_text1.append(temp)
# print(test_text1[:100])


### Print Sample Output

In [14]:
print(train_labels[8])
print('Before Removing Stopwords:')    
print(train_text_tokenized[8])
print('\nAfter Removing Stopwords:') 
print(train_text1[8])

0
Before Removing Stopwords:
['Who', 'should', 'replace', 'Nikki', 'Haley', 'as', 'our', 'ambassador', 'to', 'the', 'U', 'N']

After Removing Stopwords:
['replace', 'Nikki', 'Haley', 'ambassador', 'U', 'N']


# Duplicate Sentimenal words

In [16]:
train_text2=[]

for sentence in train_text1:
    temp=[]
    for word in sentence:
        temp.append(word)
        for SentimentWord in SentimentWords:
            if word.lower()==SentimentWord or word==SentimentWord:
                temp.append(word)
                temp.append(word)
    train_text2.append(temp)

### Print Sample Output

In [17]:
# print("Propoganda:")
# print(train_labels[:64])
print('Before Duplication:')
print(train_text1[64])
print('After Duplication:')
print(train_text2[64])

Before Duplication:
['Muhammad', 'advises', 'thing', 'according', 'tradition', 'Muslim', 'doctors', 'wherefore', 'great', 'Temur', 'strove', 'exterminate', 'infidels', 'acquire', 'glory', 'signalise', 'greatness', 'conquests']
After Duplication:
['Muhammad', 'advises', 'thing', 'according', 'tradition', 'Muslim', 'Muslim', 'Muslim', 'doctors', 'wherefore', 'great', 'great', 'great', 'Temur', 'strove', 'exterminate', 'infidels', 'infidels', 'infidels', 'acquire', 'glory', 'signalise', 'greatness', 'conquests']


# Detokenizer

In [30]:
train_text_detokenized=[]
for i in range(len(train_text2)):
    temp=""
    for j in range(len(train_text2[i])):
        temp=temp + " " + train_text2[i][j]        
    train_text_detokenized.append(temp)
# print(train_text_detokenized[:100])

dev_text_detokenized=[]
for i in range(len(dev_text1)):
    temp=""
    for j in range(len(dev_text1[i])):
        temp=temp + " " + dev_text1[i][j]        
    dev_text_detokenized.append(temp)
# print(dev_text_detokenized[:100])

test_text_detokenized=[]
for i in range(len(test_text1)):
    temp=""
    for j in range(len(test_text1[i])):
        temp=temp + " " + test_text1[i][j]        
    test_text_detokenized.append(temp)
# print(test_text_detokenized[:100])

### Print Sample Output

In [31]:
print(train_text_detokenized[60:64])

[' Committee Republicans considered political political political optics didn t relish prospect male team 11 senators questioning woman hired sex crimes prosecutor Rachel Mitchell Maricopa County Arizona Arizona Arizona ask questions promises highly watched hearing', ' Swiss bishop bishop bishop signs statement statement statement calling Pope s reading Amoris Laetitia alien Catholic Catholic Catholic Faith Faith Faith', ' called Francis Francis Francis resignation know degree certitude lawyer seeks Viganò s key allegations Francis Francis Francis substantially true assuredly reach conclusion true hesitation Francis Francis Francis resign', ' BAR SAFE SAFE SAFE SPACE']


In [28]:
max_seq_len = 119
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text_detokenized,
    # train_text2,
    # is_split_into_words=True,
    max_length = max_seq_len,
    pad_to_max_length=True,
    # padding='longest',
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    dev_text_detokenized,
    # dev_text1,
    # is_split_into_words=True,
    max_length = max_seq_len,
    pad_to_max_length=True,
    # padding='longest',
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text_detokenized,
    # test_text1,
    # is_split_into_words=True,
    max_length = max_seq_len,
    pad_to_max_length=True,
    # padding=True,
    truncation=True,
    return_token_type_ids=False
)

# Convert Integer Sequences to Tensors

In [29]:
# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels)



# for validation set
dev_seq = torch.tensor(tokens_val['input_ids'])
dev_mask = torch.tensor(tokens_val['attention_mask'])
dev_y = torch.tensor(dev_labels)

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels)

ValueError: expected sequence of length 10 at dim 1 (got 15)

### Print Sample Output

In [22]:
print(train_seq[1])
print(train_mask[1])
print(train_y[1])
print(train_labels[1])
# print(test_seq)
# print(test_mask)
# print(test_y)

tensor([  101, 13780,  6739,  8781,  3768, 16121,  2746,  3309,  3645, 13108,
        10743, 10743, 10743,  7493,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,

# Classifiers

## SVM

In [23]:

from sklearn import svm

clf_svm = svm.SVC(kernel='rbf',C=1, probability=True, gamma=0.00000001)
clf_svm.fit(train_seq, train_y)

# clf_svm.predict(train_seq)

SVC(C=1, gamma=1e-08, probability=True)

### Hyperparameter Tune using Training Data for SVM

In [24]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

print(__doc__)

# Loading the Digits dataset
digits = datasets.load_digits()

# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts
train_seq, test_seq, train_y, test_y = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(
        SVC(), tuned_parameters, scoring='%s_macro' % score
    )
    clf.fit(train_seq, train_y)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = test_y, clf.predict(test_seq)
    print(classification_report(y_true, y_pred))
    print()

Automatically created module for IPython interactive environment
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

Grid scores on development set:

0.986 (+/-0.016) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.959 (+/-0.028) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.026) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.983 (+/-0.026) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.983 (+/-0.026) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.974 (+/-0.012) for {'C': 1, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 10, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 100, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model 

### Sample Output

In [25]:
test_svm_predict = clf_svm.predict(test_seq)
print("Actual Lables:")
print(test_labels[2000:2020])
print("Predicted Lables:")
print(test_svm_predict[2000:2020])

print("\n")
print("Actual Lables:")
print(test_labels[:20])
print("Predicted Lables:")
print(test_svm_predict[:20])

ValueError: X.shape[1] = 64 should be equal to 119, the number of features at training time

## Decision Tree

In [None]:

from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier(random_state=0)
clf_dec.fit(train_seq , train_y)

# clf_dec.predict(test_seq)

In [None]:
test_dec_predict = clf_dec.predict(test_seq)
print("Actual Lables:")
print(test_labels[2000:2050])
print("Predicted Lables:")
print(test_dec_predict[2000:2050])

print("\n")
print("Actual Lables:")
print(test_labels[:50])
print("Predicted Lables:")
print(test_dec_predict[:50])

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB(var_smoothing=0.2848035868435802)
clf_gnb.fit(train_seq , train_y)

# clf_gnb.predict(test_seq)

### Hyperparameter Tune using Training Data for Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}
nbModel_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1)
nbModel_grid.fit(train_seq, train_y)
print(nbModel_grid.best_estimator_)

### Sample Output

In [None]:
test_gnb_predict = clf_gnb.predict(test_seq)
print("Actual Lables:")
print(test_labels[2000:2050])
print("Predicted Lables:")
print(test_gnb_predict[2000:2050])

print("\n")
print("Actual Lables:")
print(test_labels[:10])
print("Predicted Lables:")
print(test_gnb_predict[:10])

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_seq, train_y)

# clf_log.predict(test_seq)

In [None]:
test_log_predict = clf_log.predict(test_seq)
print("Actual Lables:")
print(test_labels[2000:2010])
print("Predicted Lables:")
print(test_log_predict[2000:2010])

print("\n")
print("Actual Lables:")
print(test_labels[:10])
print("Predicted Lables:")
print(test_log_predict[:10])

# Evaluation

## F1

In [None]:
from sklearn.metrics import f1_score

# For Support Vector Machine
print(f1_score(test_y, clf_svm.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Support Decision Tree
print(f1_score(test_y,clf_dec.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Support Naive Bayes
print(f1_score(test_y,clf_gnb.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# For Logistic Regression
print(f1_score(test_y,clf_log.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))


## Mean Accuracy

In [None]:
# For Support Vector Machine
print(clf_svm.score(test_seq,test_y))
# For Decision Tree
print(clf_dec.score(test_seq,test_y))
# For Decision Naive Bayes
print(clf_gnb.score(test_seq,test_y))
# For Logistic Regression
print(clf_log.score(test_seq,test_y))

## Precision

In [None]:
# Precision
from sklearn.metrics import precision_score

# For Support Vector Machine
print(precision_score(test_y, clf_svm.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Tree
print(precision_score(test_y, clf_dec.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Naive Bayes
print(precision_score(test_y, clf_gnb.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Logistic Regression
print(precision_score(test_y, clf_log.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

## Recall

In [None]:
from sklearn.metrics import recall_score

# For Support Vector Machine
print(recall_score(test_y, clf_svm.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Tree
print(recall_score(test_y, clf_dec.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Decision Naive Bayes
print(recall_score(test_y, clf_gnb.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))
# For Logistic Regression
print(recall_score(test_y, clf_log.predict(test_seq),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# VIP : Predict_Proba using Threshold

### Extract Evaluation of Above Threshold SVM

In [None]:
AboveThresholdsvm = [] 
AboveThresholdsvmbrop = []
# [0][1] = Postivie
# [0][0] = Negative

for i in range(len(test_seq)):
  if clf_svm.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_svm.predict_proba(test_seq[i].reshape(1, -1))[0][0]>0.60:
    AboveThresholdsvm.append(test_text[i])
    AboveThresholdsvmbrop.append(test_labels[i])

print("Number of records above threshold SVM:")
print(len(AboveThresholdsvm))

### Extract Evaluation of Above Threshold Desicion Tree

In [None]:
AboveThresholdDT = [] 
AboveThresholdDTbrop = []
# [0][1] = Postivie
# [0][0] = Negative


for i in range(len(test_seq)):
  if clf_dec.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_dec.predict_proba(test_seq[i].reshape(1, -1))[0][0]>0.60:
    AboveThresholdDT.append(test_text[i])
    AboveThresholdDTbrop.append(test_labels[i])


print("Number of records above threshold Desicion Tree:")
print(len(AboveThresholdDT))

### Extract Evaluation of Above Threshold Naive Bayes

In [None]:
AboveThresholdGNB = [] 
AboveThresholdGNBbrop = []
# [0][1] = Postivie
# [0][0] = Negative


for i in range(len(test_seq)):
  if clf_gnb.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_gnb.predict_proba(test_seq[i].reshape(1, -1))[0][0]>0.60:
    AboveThresholdGNB.append(test_text[i])
    AboveThresholdGNBbrop.append(test_labels[i])


print("Number of records above threshold Logistic Regression:")
print(len(AboveThresholdGNB))

### Extract Evaluation of Above Threshold Logistic Regression

In [None]:
AboveThresholdLR = [] 
AboveThresholdLRbrop = []
# [0][1] = Postivie
# [0][0] = Negative


for i in range(len(test_seq)):
  if clf_log.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_log.predict_proba(test_seq[i].reshape(1, -1))[0][0]>0.60:
    AboveThresholdLR.append(test_text[i])
    AboveThresholdLRbrop.append(test_labels[i])


print("Number of records above threshold Logistic Regression:")
print(len(AboveThresholdLR))

In [None]:
print(clf_log.predict_proba(test_seq[2002].reshape(1, -1))[0][1])
print(clf_log.predict(test_seq[2002].reshape(1, -1)))
print(test_labels[2002])

### Tokenize Output above threshod

In [None]:
# Bert
tokens_testsvm = tokenizer.batch_encode_plus(
    AboveThresholdsvm,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)
test_seqsvm = torch.tensor(tokens_testsvm['input_ids'])
test_masksvm = torch.tensor(tokens_testsvm['attention_mask'])
test_ysvm = torch.tensor(AboveThresholdsvmbrop)

tokens_testDT = tokenizer.batch_encode_plus(
    AboveThresholdDT,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)
test_seqDT = torch.tensor(tokens_testDT['input_ids'])
test_maskDT = torch.tensor(tokens_testDT['attention_mask'])
test_yDT = torch.tensor(AboveThresholdDTbrop)

tokens_testGNB = tokenizer.batch_encode_plus(
    AboveThresholdGNB,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)
test_seqGNB = torch.tensor(tokens_testGNB['input_ids'])
test_maskGNB = torch.tensor(tokens_testGNB['attention_mask'])
test_yGNB = torch.tensor(AboveThresholdGNBbrop)

tokens_testLR = tokenizer.batch_encode_plus(
    AboveThresholdLR,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)
test_seqLR = torch.tensor(tokens_testLR['input_ids'])
test_maskLR = torch.tensor(tokens_testLR['attention_mask'])
test_yLR = torch.tensor(AboveThresholdLRbrop)



### Evaluation Above Threshold

In [None]:
#F1
print("F1 SVM more than 60% threshold")
print(f1_score(test_ysvm, clf_svm.predict(test_seqsvm),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("F1 DT more than 60% threshold")
print(f1_score(test_yDT, clf_log.predict(test_seqDT),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("F1 GNB more than 60% threshold")
print(f1_score(test_yGNB, clf_log.predict(test_seqGNB),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("F1 LR more than 60% threshold")
print(f1_score(test_yLR, clf_log.predict(test_seqLR),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

In [None]:
# Accuracy
# For Support Vector Machine
print("Accuracy SVM more than 60% threshold")
print(clf_svm.score(test_seqsvm,test_ysvm))

print("Accuracy DT more than 60% threshold")
print(clf_dec.score(test_seqDT,test_yDT))

print("Accuracy GNB more than 60% threshold")
print(clf_gnb.score(test_seqGNB,test_yGNB))

print("Accuracy LR more than 60% threshold")
print(clf_log.score(test_seqLR,test_yLR))


In [None]:
# Precision
from sklearn.metrics import precision_score
# For Support Vector Machine
print("Precision SVM more than 60% threshold")
print(precision_score(test_ysvm, clf_svm.predict(test_seqsvm),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("Precision DT more than 60% threshold")
print(precision_score(test_yDT, clf_log.predict(test_seqDT),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("Precision GNB more than 60% threshold")
print(precision_score(test_yGNB, clf_log.predict(test_seqGNB),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("Precision LR more than 60% threshold")
print(precision_score(test_yLR, clf_log.predict(test_seqLR),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))


In [None]:
# Recall
from sklearn.metrics import recall_score
print("Recall SVM more than 60% threshold")
print(recall_score(test_ysvm, clf_svm.predict(test_seqsvm),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("Recall DT more than 60% threshold")
print(recall_score(test_yDT, clf_log.predict(test_seqDT),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("Recall GNB more than 60% threshold")
print(recall_score(test_yGNB, clf_log.predict(test_seqGNB),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

print("Recall LR more than 60% threshold")
print(recall_score(test_yLR, clf_log.predict(test_seqLR),average = None, 
labels=[Propaganda.POSITIVE,Propaganda.NEGATIVE]))

# Bagging

In [None]:
Output=[]
for i in range(len(test_seq)):
  Vote=0
  if clf_svm.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60:
    Vote=Vote+1
  elif clf_dec.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60:
    Vote=Vote+1
  elif clf_gnb.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60:
    Vote=Vote+1
  elif clf_log.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60:
    Vote=Vote+1
  if (Vote>=1):
    Output.append("Propaganda")
  else:
    Output.append("nonPropaganda")

# for i in range(len(test_seq)):
#   Vote=0
#   if clf_svm.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_dec.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_gnb.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60 or clf_log.predict_proba(test_seq[i].reshape(1, -1))[0][1]>0.60:
#     Vote=Vote+1
#   if (Vote==1):
#     Output.append("Propaganda")
#   else:
#     Output.append("nonPropaganda")

# print(Output.count(Propaganda))

### Print Sample Output

In [None]:
for i in range(2000,2100):
    print(test[i].SUBJprop, end= '')

print('\n')
for i in range(2000,2100):
    x = clf_svm.predict(test_seq[i].reshape(1, -1))
    print(x, end='')

# print(len(test))
# print(len(test_seq))

# x = clf_svm.predict(test_seq[2].reshape(1, -1))
# print(x)

In [None]:
CountPropaganda=0
for i in range(len(Output)):
    if (Output[i]=="nonPropaganda"):
        CountPropaganda=CountPropaganda+1

print("number of propandas in output:")
print(CountPropaganda)
