# Import All Packages




In [None]:
    import random
    import pandas as pd
    import numpy as np
    import string
    from string import digits
    from sklearn.model_selection import train_test_split
    import torch
    import torch.nn as nn
    from sklearn.metrics import classification_report
    import transformers
    from transformers import AutoModel, BertTokenizerFast
    from ipywidgets import IntProgress
    from tqdm import tqdm
    import nltk
    from nltk.corpus import stopwords
    from spacy.lang.en import English
    import re
    from sklearn.feature_extraction.text import CountVectorizer



# Load and inspect the train and test datasets

In [None]:
# Load the training and test workbooks; both are .xlsx files read via openpyxl.
excel_options = {'engine': 'openpyxl'}
train = pd.read_excel('Data/trainDataset.xlsx', **excel_options)
test = pd.read_excel('Data/testDataset.xlsx', **excel_options)

In [None]:
# Report (rows, columns) for each dataset.
for shape_label, frame in (
    ('Training data shape: ', train),
    ('Testing data shape: ', test),
):
    print(shape_label, frame.shape)


In [None]:
# Report (rows, columns) for each dataset.
print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

# First few rows of the training dataset.
# A notebook cell only renders its final expression, so this preview has to be
# printed explicitly; as a bare expression it was silently discarded.
print(train.head())

# First few rows of the testing dataset (rendered as the cell's final expression).
test.head()

### Missing Values treatment in the dataset

In [None]:
# Count missing values per column in each dataset.
# Only the final expression of a cell is rendered, so both summaries are
# printed explicitly; previously the training-set result was discarded.
print('Missing values in training set:')
print(train.isnull().sum())
print('Missing values in test set:')
print(test.isnull().sum())

### Analysis of the SUBJprop Column

In [None]:
# Show the first sentence labelled as propaganda (SUBJprop == 1).
propaganda_rows = train[train['SUBJprop'] == 1]
print("Propaganda Sentence example :", propaganda_rows['Sentence'].values[0])

# Show the first sentence labelled as non-propaganda (SUBJprop == 0).
non_propaganda_rows = train[train['SUBJprop'] == 0]
print("Non-Propaganda Sentence example :", non_propaganda_rows['Sentence'].values[0])

### Distribution of the SUBJprop Column

In [None]:
# Absolute number of rows per SUBJprop label
# (value_counts(normalize=True) would give proportions instead).
subjprop_counts = train['SUBJprop'].value_counts()
subjprop_counts

### Plot Distribution of SUBJprop Column

In [None]:
# cufflinks is imported here rather than in the first cell; the plotting cells
# below call `.iplot` on pandas objects, which presumably relies on this import.
import cufflinks as cf
# Switch cufflinks to offline plotting for this session (per the call name;
# see the cufflinks documentation for the exact effect).
cf.go_offline()
# NOTE(review): this sets offline=False immediately after go_offline() —
# confirm the combination is intended and that world_readable=True is wanted.
cf.set_config_file(offline=False, world_readable=True)

In [None]:
# Share of each SUBJprop label in the training set, drawn as a bar chart.
train_label_share = train['SUBJprop'].value_counts(normalize=True)
train_bar_options = dict(
    kind='bar',
    yTitle='Percentage',
    linecolor='black',
    opacity=0.7,
    color='red',
    theme='pearl',
    bargap=0.6,
    gridcolor='white',
    title='Distribution of SUBJprop Column in the training set',
)
train_label_share.iplot(**train_bar_options)

In [None]:
# Share of each SUBJprop label in the test set, drawn with the same styling
# as the training-set chart.
test_label_share = test['SUBJprop'].value_counts(normalize=True)
test_bar_options = {
    'kind': 'bar',
    'yTitle': 'Percentage',
    'linecolor': 'black',
    'opacity': 0.7,
    'color': 'red',
    'theme': 'pearl',
    'bargap': 0.6,
    'gridcolor': 'white',
    'title': 'Distribution  of SUBJprop column in the test set',
}
test_label_share.iplot(**test_bar_options)

### Text Data Preprocessing

In [None]:
def clean_text(text):
    """Normalise a raw sentence before tokenisation.

    The steps run in this order (the order matters: URLs and tags must be
    removed while their punctuation is still present):

    1. lowercase the text
    2. remove anything inside square brackets
    3. remove ``http(s)://`` and ``www.`` links
    4. remove HTML-like ``<...>`` tags
    5. remove ASCII punctuation
    6. remove newline characters
    7. remove every word that contains a digit

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace left behind by removed spans is not
        collapsed, so consecutive spaces may remain.
    """
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid Python string escapes, which newer interpreters warn about.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def text_preprocessing(text, remove_stopwords=False):
    """Clean a sentence and re-join its word tokens with single spaces.

    The text is passed through ``clean_text``, split into ``\\w+`` tokens and
    joined back with one space between tokens.

    The previous implementation built an English stop-word list for every
    token but never used the filtered result, so stop words were always kept.
    That output is preserved by default; the wasted work is gone and the
    filtering is now available on request.

    Args:
        text: The raw sentence to process.
        remove_stopwords: When True, drop NLTK English stop words from the
            tokens. Defaults to False, which matches the historical output.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(clean_text(text))
    if remove_stopwords:
        # Build the lookup set once per call instead of once per token.
        english_stopwords = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in english_stopwords]
    return ' '.join(tokens)

In [None]:
# Add a 'text_clean' column to both datasets: every sentence is coerced to
# str first and then run through text_preprocessing.
for dataset in (train, test):
    dataset['text_clean'] = dataset['Sentence'].apply(str).apply(text_preprocessing)

### Sample Output

In [None]:
# Preview the first five rows, now including the 'text_clean' column.
train.head(n=5)

## Analyzing Text Statistics

In [None]:
# Per-sentence statistics on the cleaned text: character count and
# whitespace-separated word count.
cleaned_sentences = train['text_clean']
train['text_len'] = cleaned_sentences.astype(str).map(len)
train['text_word_count'] = cleaned_sentences.map(
    lambda sentence: len(str(sentence).split())
)
train.head(3)

In [None]:
# Split the training rows by label: SUBJprop == 1 -> pos, SUBJprop == 0 -> neg.
subjprop_labels = train['SUBJprop']
pos = train[subjprop_labels == 1]
neg = train[subjprop_labels == 0]

# Sentence length analysis

In [None]:
# Histogram options shared by both text-length charts.
length_hist_options = dict(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    yTitle='count',
)

# Character-length distribution of the positive (SUBJprop == 1) sentences.
pos_text_lengths = pos['text_len']
pos_text_lengths.iplot(
    color='red',
    title='Positive Text Length Distribution',
    **length_hist_options)

# Character-length distribution of the negative (SUBJprop == 0) sentences.
neg_text_lengths = neg['text_len']
neg_text_lengths.iplot(
    color='green',
    title='Negative Text Length Distribution',
    **length_hist_options)

# Text word count analysis

In [None]:
# Word-count distribution of the positive (SUBJprop == 1) sentences.
# The x axis shows words per sentence, so it is labelled 'word count'
# (it was previously mislabelled 'text length').
pos['text_word_count'].iplot(
    kind='hist',
    bins=50,
    xTitle='word count',
    linecolor='black',
    color='red',
    yTitle='count',
    title='Positive Text word count')

# Word-count distribution of the negative (SUBJprop == 0) sentences.
neg['text_word_count'].iplot(
    kind='hist',
    bins=50,
    xTitle='word count',
    linecolor='black',
    color='green',
    yTitle='count',
    title='Negative Text word count')

# Distribution of top unigrams

In [None]:
#source of code : https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
def get_top_n_words(corpus, n=None):
    """
    Return the n most frequent words of a corpus as (word, count) pairs.

    The vocabulary is built with CountVectorizer using its 'english'
    stop-word setting. Pairs are ordered by descending count; n=None
    returns every word.
    """
    vectorizer = CountVectorizer(stop_words = 'english')
    vectorizer.fit(corpus)
    document_term_counts = vectorizer.transform(corpus)
    # Collapse the document axis to get one total per vocabulary column.
    column_totals = document_term_counts.sum(axis=0)
    ranked = []
    for term, column in vectorizer.vocabulary_.items():
        ranked.append((term, column_totals[0, column]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
# Twenty most frequent unigrams in each class.
pos_unigrams = get_top_n_words(pos['text_clean'],20)
neg_unigrams = get_top_n_words(neg['text_clean'],20)

# Options shared by both horizontal bar charts.
unigram_bar_options = dict(kind='bar', yTitle='Count', linecolor='black', orientation='h')

# Propaganda sentences: bars sorted by ascending count.
df1 = pd.DataFrame(pos_unigrams, columns = ['Text' , 'count'])
pos_unigram_totals = df1.groupby('Text').sum()['count']
pos_unigram_totals.sort_values(ascending=True).iplot(
    color='red', title='Top 20 1-Word in Propaganda Sentences', **unigram_bar_options)

# Non-propaganda sentences: bars sorted by ascending count.
df2 = pd.DataFrame(neg_unigrams, columns = ['Text' , 'count'])
neg_unigram_totals = df2.groupby('Text').sum()['count']
neg_unigram_totals.sort_values(ascending=True).iplot(
    color='green', title='Top 20 1-Word in Non-Propaganda Sentences', **unigram_bar_options)


### Distribution of top Bigrams

In [None]:
def get_top_n_gram(corpus,ngram_range,n=None):
    """
    Return the n most frequent n-grams of a corpus as (ngram, count) pairs.

    ngram_range is forwarded to CountVectorizer, e.g. (2, 2) for bigrams,
    together with its 'english' stop-word setting. Pairs are ordered by
    descending count; n=None returns every n-gram.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range,stop_words = 'english')
    vectorizer.fit(corpus)
    # One total per vocabulary column, summed over all documents.
    totals = vectorizer.transform(corpus).sum(axis=0)
    gram_counts = [
        (gram, totals[0, position])
        for gram, position in vectorizer.vocabulary_.items()
    ]
    gram_counts.sort(key=lambda pair: pair[1], reverse=True)
    return gram_counts[:n]

In [None]:
# Twenty most frequent bigrams in each class.
bigram_range = (2,2)
pos_bigrams = get_top_n_gram(pos['text_clean'],bigram_range,20)
neg_bigrams = get_top_n_gram(neg['text_clean'],bigram_range,20)

# Options shared by both horizontal bar charts.
bigram_bar_options = dict(kind='bar', yTitle='Count', linecolor='black', orientation='h')

# Propaganda sentences: bars sorted by ascending count.
df1 = pd.DataFrame(pos_bigrams, columns = ['Text' , 'count'])
pos_bigram_totals = df1.groupby('Text').sum()['count']
pos_bigram_totals.sort_values(ascending=True).iplot(
    color='red', title='Top 20 2-Word in Propaganda Sentences', **bigram_bar_options)

# Non-propaganda sentences: bars sorted by ascending count.
df2 = pd.DataFrame(neg_bigrams, columns = ['Text' , 'count'])
neg_bigram_totals = df2.groupby('Text').sum()['count']
neg_bigram_totals.sort_values(ascending=True).iplot(
    color='green', title='Top 20 2-Word in Non-Propaganda Sentences', **bigram_bar_options)

# Distribution of top Trigrams 

In [None]:
# Twenty most frequent trigrams in each class.
pos_trigrams = get_top_n_gram(pos['text_clean'],(3,3),20)
neg_trigrams = get_top_n_gram(neg['text_clean'],(3,3),20)

# Positive (SUBJprop == 1) sentences: bars sorted by ascending count.
# The title previously misspelled "positive" as "positve".
df1 = pd.DataFrame(pos_trigrams, columns = ['Text' , 'count'])
df1.groupby('Text').sum()['count'].sort_values(ascending=True).iplot(
    kind='bar', yTitle='Count', linecolor='black',color='red', title='Top 20 Trigrams in positive text',orientation='h')

# Negative (SUBJprop == 0) sentences: bars sorted by ascending count.
df2 = pd.DataFrame(neg_trigrams, columns = ['Text' , 'count'])
df2.groupby('Text').sum()['count'].sort_values(ascending=True).iplot(
    kind='bar', yTitle='Count', linecolor='black', color='green',title='Top 20 Trigrams in negative text',orientation='h')

