In [None]:
!pip install autokeras




## Import Non-Preprocessed Data

In [None]:
import os

import numpy as np
import tensorflow as tf
from sklearn.datasets import load_files
import pandas as pd

import autokeras as ak


In [None]:
# Mount Google Drive inside the Colab runtime; the dataset is read from it below.
from google.colab import drive 
# The mount point is '/content/gdrive'. The read_csv calls in this notebook use
# the relative path 'gdrive/...', so they rely on the working directory being '/content'.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the cleaned, binarized PolitiFact export from the mounted Drive.
# The path is anchored at the '/content/gdrive' mount point so the read no longer
# depends on the notebook's current working directory.
df = pd.read_csv('/content/gdrive/My Drive/Licenta/Data/politifact_clean_binarized.csv')
# Keep only the statement text and its binary label.
df = df[['statement', 'veracity']]
df.head()

Unnamed: 0,statement,veracity
0,"Sen. Kamala Harris is ""supporting the animals ...",0
1,"Says Ronald Reagan said immigrants ""brought wi...",1
2,"Says Democratic Senators ""demand Supreme Court...",0
3,"""Tim Kaine doesn’t want a border at all. He wa...",0
4,"""George H.W. Bush has died at 94.""",0


In [None]:
# Hold out the final 1000 rows as the test set; every row before them is used for training.
test_data = df.iloc[-1000:]
train_data = df.iloc[:-1000]
print(train_data.shape, test_data.shape)

(10188, 2) (1000, 2)


In [None]:
# Turn the raw statement column and its labels into NumPy arrays for the classifier.
x_train, y_train = np.array(train_data["statement"]), np.array(train_data["veracity"])
x_test, y_test = np.array(test_data["statement"]), np.array(test_data["veracity"])

# Sanity check: both training arrays are 1-D with one entry per training row.
print(x_train.shape)  # (10188,)
print(y_train.shape)  # (10188,)
print(x_train[0][:50])  # first 50 characters of the first training statement


(10188,)
(10188,)
Sen. Kamala Harris is "supporting the animals of M


The second step is to run the [TextClassifier](https://autokeras.com/text_classifier/).  As a quick
demo, we limit the search to 2 trials and training to 3 epochs, with early stopping on `val_loss`.
You can also leave the epochs unspecified for an adaptive number of epochs.


## Train and Evaluate on Politifact Binarized Clean

In [None]:
# Baseline run on the raw (non-preprocessed) statements.
# Early stopping watches val_loss with a patience of one epoch.
# NOTE(review): the saved output under this cell shows a fourth trial and a
# KeyboardInterrupt, so it appears to come from a run with other settings — re-run to refresh.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)
# Initialize the text classifier.
clf = ak.TextClassifier(
    overwrite=True, max_trials=2
)  # The search is capped at 2 trials to keep the demo short.
# Feed the text classifier with training data.
clf.fit(x_train, y_train, callbacks=[callback], epochs=3)
# Predict with the best model.
# NOTE(review): predicted_y is not used again in the visible cells.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
# Presumably prints [loss, accuracy] — confirm against the classifier's metrics.
print(clf.evaluate(x_test, y_test))



Trial 3 Complete [04h 26m 24s]
val_loss: 0.6354480981826782

Best val_loss So Far: 0.6354480981826782
Total elapsed time: 04h 30m 22s

Search: Running Trial #4

Hyperparameter    |Value             |Best Value So Far 
text_block_1/bl...|bert              |bert              
classification_...|0                 |0                 
optimizer         |adam_weight_decay |adam_weight_decay 
learning_rate     |2e-05             |2e-05             
text_block_1/ma...|20000             |20000             
text_block_1/be...|512               |512               

Epoch 1/3

KeyboardInterrupt: ignored

## Import Data and Preprocess It 
We want to see whether text preprocessing improves the performance of the AutoML models.

In [None]:
!pip install -U spacy
import sys
!{sys.executable} -m spacy download en_core_web_lg

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/1b/d8/0361bbaf7a1ff56b44dca04dace54c82d63dad7475b7d25ea1baefafafb2/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 296kB/s 
Collecting spacy-legacy<3.1.0,>=3.0.4
  Downloading https://files.pythonhosted.org/packages/8d/67/d4002a18e26bf29b17ab563ddb55232b445ab6a02f97bf17d1345ff34d3f/spacy_legacy-3.0.5-py2.py3-none-any.whl
Collecting thinc<8.1.0,>=8.0.3
[?25l  Downloading https://files.pythonhosted.org/packages/61/87/decceba68a0c6ca356ddcb6aea8b2500e71d9bc187f148aae19b747b7d3c/thinc-8.0.3-cp37-cp37m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 42.3MB/s 
Collecting pydantic<1.8.0,>=1.7.1
[?25l  Downloading https://files.pythonhosted.org/packages/b3/0a/52ae1c659fc08f13dd7c0ae07b88e4f807ad83fb9954a59b0b0a3d1a8ab6/pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl (9.1MB)
[K     |████████████████████████████████| 9.1MB

In [None]:
# Packages for Data Manipulation
import numpy as np
import pandas as pd
import os
from collections import Counter
import spacy

# for NLP
import re
import unicodedata
from bs4 import BeautifulSoup

# Widen pandas' display limits so long statements and all columns are shown in full.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_colwidth', 300) 

# Silence every Python warning for the rest of the session.
# NOTE(review): the earlier comment referred to train_test_split, which is not
# called anywhere in the visible notebook; a blanket 'ignore' can also hide
# relevant warnings from the libraries used here — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Reload the cleaned, binarized PolitiFact export for the preprocessing experiments.
# The path is anchored at the '/content/gdrive' mount point so the read no longer
# depends on the notebook's current working directory.
df = pd.read_csv('/content/gdrive/My Drive/Licenta/Data/politifact_clean_binarized.csv')
# Keep only the statement text and its binary label.
df = df[['statement', 'veracity']]
df.head()

Unnamed: 0,statement,veracity
0,"Sen. Kamala Harris is ""supporting the animals ...",0
1,"Says Ronald Reagan said immigrants ""brought wi...",1
2,"Says Democratic Senators ""demand Supreme Court...",0
3,"""Tim Kaine doesn’t want a border at all. He wa...",0
4,"""George H.W. Bush has died at 94.""",0


In [None]:
# Class balance of the labels, i.e. the "chance" baseline of a classifier that
# always answers true or always answers false. Most useful for imbalanced data.
tru_count = len(df[df["veracity"] == True])
fal_count = len(df[df["veracity"] == False])

print("true statements: ", tru_count)
print("false statements: ", fal_count)

labelled_total = tru_count + fal_count  # len(df) as denominator also works
print("chance of truth baseline: ", round(tru_count / labelled_total, 2))
print("chance of false baseline: ", round(fal_count / labelled_total, 2))

true statements:  4853
false statements:  6335
chance of truth baseline:  0.43
chance of false baseline:  0.57


## Create Normalized Text

In [None]:
# Load the 'en_core_web_lg' spaCy pipeline once at module level; normalize()
# calls this shared `nlp` object for lemmatization and token filtering.
nlp = spacy.load('en_core_web_lg')

def strip_html_tags(text):
    """Return the visible text of ``text`` with HTML markup removed.

    ``<iframe>`` and ``<script>`` elements are dropped together with their
    contents, the remaining markup is stripped, and every run of carriage
    returns / line feeds is collapsed into a single newline.

    Args:
        text: Raw string that may contain HTML.

    Returns:
        The plain-text content as a string.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop embedded frames and scripts entirely instead of keeping their bodies.
    for element in soup(["iframe", "script"]):
        element.extract()
    stripped_text = soup.get_text()
    # The previous pattern r'[\r|\n|\r\n]+' was a character class, so it also
    # matched a literal '|' and turned pipes into newlines. Match only CR/LF.
    return re.sub(r'[\r\n]+', '\n', stripped_text)
        
def remove_accented_characters(text):
    """Fold accented characters in ``text`` down to plain ASCII.

    The text is decomposed with NFKD, then encoded to ASCII with errors
    ignored, which discards every character that has no ASCII form.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [None]:
# configurable text normalizer (each step is opt-in)
def normalize(text, 
              remove_html = False, 
              lowercase = False,
              lemmatize = False,
              remove_extra_newlines = False,
              remove_stopwords = False,
              remove_non_alphabetic = False,
              remove_accented_char = False,
              ):
    """Apply the selected normalization steps to ``text`` and return the result.

    Steps run in a fixed order, each only when its flag is true: HTML
    stripping, lowercasing, lemmatization, newline collapsing, stop-word
    removal, non-alphabetic token removal, accent removal.

    Args:
        text: Input string.
        remove_html: Strip markup with ``strip_html_tags``.
        lowercase: Lowercase the whole string.
        lemmatize: Replace each spaCy token by its lemma, joined with spaces.
        remove_extra_newlines: Replace every run of CR/LF characters with one space.
        remove_stopwords: Drop tokens found in spaCy's English stop-word set.
        remove_non_alphabetic: Keep only tokens for which ``str.isalpha()`` is true.
        remove_accented_char: Fold to ASCII with ``remove_accented_characters``.

    Returns:
        The normalized string. With every flag left at ``False`` the input is
        returned unchanged.
    """
    # remove html
    if remove_html:
        text = strip_html_tags(text)

    # lowercase all text
    if lowercase:
        text = text.lower()

    # lemmatize
    if lemmatize:
        doc = nlp(text)
        # '-PRON-' is presumably the pronoun placeholder lemma of older spaCy
        # releases; keep the surface form whenever it shows up.
        text = ' '.join(word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc)

    # remove extra new lines
    if remove_extra_newlines:
        # The previous pattern r'[\r|\n|\r\n]+' was a character class that also
        # matched a literal '|', so pipes were replaced by spaces. Match only CR/LF.
        text = re.sub(r'[\r\n]+', ' ', text)

    # remove stopwords
    if remove_stopwords:
        stopwords = spacy.lang.en.stop_words.STOP_WORDS
        doc = nlp(text)
        # Compare case-insensitively: the lemmatizer can hand back capitalised
        # tokens (the sample run turns 'i' into 'I'), which an exact-case
        # lookup lets through.
        text = ' '.join(token.text for token in doc if token.text.lower() not in stopwords)

    # remove non-alphabetic characters
    if remove_non_alphabetic:
        doc = nlp(text)
        text = ' '.join(token.text for token in doc if token.text.isalpha())

    # remove accented characters
    if remove_accented_char:
        text = remove_accented_characters(text)

    return text

In [None]:
# Demo input for the normalizer: stray symbols, mixed case, contractions,
# an accented word, repeated spaces and a digit.
string = "@ ! i DON'TTT, won't, a féél    can't not USING the  NLP 27x maaaah?"

# Light normalization: HTML stripping, lowercasing, newline collapsing and accent removal only.
normalize(
    string,
    remove_html=True,
    lowercase=True,
    remove_extra_newlines=True,
    remove_accented_char=True,
)

"@ ! i don'ttt, won't, a feel    can't not using the  nlp 27x maaaah?"

In [None]:
# Add the lightly normalized text as its own column. Series.apply forwards the
# keyword arguments to normalize() for every statement.
df["light_normalization"] = df["statement"].apply(
    normalize,
    remove_html=True,
    lowercase=True,
    remove_extra_newlines=True,
    remove_accented_char=True,
)

In [None]:
# Sample of the full normalizer: the light options plus lemmatization,
# stop-word removal and removal of non-alphabetic tokens.
# Note: lemmatize does not work properly if the text is not lowercased first.
# Note: remove_non_alphabetic drops blank-space tokens as well as non-alphabetic ones.
normalize(
    string,
    remove_html=True,
    lowercase=True,
    lemmatize=True,
    remove_extra_newlines=True,
    remove_stopwords=True,
    remove_non_alphabetic=True,
    remove_accented_char=True,
)

'I wo feel use nlp maaaah'

In [None]:
# Add the fully normalized text as its own column. Series.apply forwards the
# keyword arguments to normalize() for every statement.
df["full_normalization"] = df["statement"].apply(
    normalize,
    remove_html=True,
    lowercase=True,
    lemmatize=True,
    remove_extra_newlines=True,
    remove_stopwords=True,
    remove_non_alphabetic=True,
    remove_accented_char=True,
)

In [None]:
# Show the first three rows to compare the raw statement with both normalized columns.
df.head(3)

Unnamed: 0,statement,veracity,light_normalization,full_normalization
0,"Sen. Kamala Harris is ""supporting the animals of MS-13.""",0,"sen. kamala harris is ""supporting the animals of ms-13.""",sen kamala harris support animal
1,"Says Ronald Reagan said immigrants ""brought with them courage and the values of family, work, and freedom. Let us pledge to each other that we can make America great again.""",1,"says ronald reagan said immigrants ""brought with them courage and the values of family, work, and freedom. let us pledge to each other that we can make america great again.""",ronald reagan immigrant bring courage value family work freedom let pledge america great
2,"Says Democratic Senators ""demand Supreme Court nominee not be unduly influenced by U.S. Constitution.""",0,"says democratic senators ""demand supreme court nominee not be unduly influenced by u.s. constitution.""",democratic senator demand supreme court nominee unduly influence constitution


## Training and Evaluating with Light Normalization

In [None]:
# Same hold-out as for the raw text: the final 1000 rows are the test set,
# every row before them is used for training.
test_data = df.iloc[-1000:]
train_data = df.iloc[:-1000]
print(train_data.shape, test_data.shape)

(10188, 4) (1000, 4)


In [None]:
# Turn the lightly normalized text and its labels into NumPy arrays for the classifier.
x_train, y_train = np.array(train_data["light_normalization"]), np.array(train_data["veracity"])
x_test, y_test = np.array(test_data["light_normalization"]), np.array(test_data["veracity"])

# Sanity check: both training arrays are 1-D with one entry per training row.
print(x_train.shape)  # (10188,)
print(y_train.shape)  # (10188,)
print(x_train[0][:50])  # first 50 characters of the first lightly normalized statement

(10188,)
(10188,)
sen. kamala harris is "supporting the animals of m


In [None]:
# Run on the lightly normalized statements.
# Early stopping watches val_loss with a patience of one epoch.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)
# Initialize the text classifier.
clf = ak.TextClassifier(
    overwrite=True, max_trials=2
)  # The search is capped at 2 trials to keep the demo short.
# Feed the text classifier with training data.
# NOTE(review): this run uses epochs=2 while the raw-text and full-normalization
# runs use epochs=3, so the three results are not strictly comparable — confirm intent.
clf.fit(x_train, y_train, callbacks=[callback], epochs=2)
# Predict with the best model.
# NOTE(review): predicted_y is not used again in the visible cells.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
# Presumably prints [loss, accuracy] — confirm against the classifier's metrics.
print(clf.evaluate(x_test, y_test))


Trial 2 Complete [00h 00m 52s]
val_loss: 0.6860910058021545

Best val_loss So Far: 0.6473612189292908
Total elapsed time: 00h 02m 45s
INFO:tensorflow:Oracle triggered exit
Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets
[0.6887917518615723, 0.5839999914169312]


## Training and Evaluating with Full Normalization

In [None]:
# Same hold-out again: the final 1000 rows are the test set,
# every row before them is used for training.
test_data = df.iloc[-1000:]
train_data = df.iloc[:-1000]
print(train_data.shape, test_data.shape)

(10188, 4) (1000, 4)


In [None]:
# Turn the fully normalized text and its labels into NumPy arrays for the classifier.
x_train, y_train = np.array(train_data["full_normalization"]), np.array(train_data["veracity"])
x_test, y_test = np.array(test_data["full_normalization"]), np.array(test_data["veracity"])

# Sanity check: both training arrays are 1-D with one entry per training row.
print(x_train.shape)  # (10188,)
print(y_train.shape)  # (10188,)
print(x_train[0][:50])  # first 50 characters of the first fully normalized statement

(10188,)
(10188,)
sen kamala harris support animal


In [None]:
# Run on the fully normalized statements.
# Early stopping watches val_loss with a patience of one epoch.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)
# Initialize the text classifier.
clf = ak.TextClassifier(
    overwrite=True, max_trials=2
)  # The search is capped at 2 trials to keep the demo short.
# Feed the text classifier with training data.
clf.fit(x_train, y_train, callbacks=[callback], epochs=3)
# Predict with the best model.
# NOTE(review): predicted_y is not used again in the visible cells.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
# Presumably prints [loss, accuracy] — confirm against the classifier's metrics.
print(clf.evaluate(x_test, y_test))

Trial 2 Complete [00h 01m 18s]
val_loss: 0.686138391494751

Best val_loss So Far: 0.6573224663734436
Total elapsed time: 00h 03m 12s
INFO:tensorflow:Oracle triggered exit
Epoch 1/3
Epoch 2/3
Epoch 3/3
INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets
[0.8634027242660522, 0.5619999766349792]


In [None]:
# Confirm the frame still has every row plus the two added normalization columns.
df.shape

(11188, 4)

In [None]:
# Final look at the columns that are about to be written to disk.
df.head()

Unnamed: 0,statement,veracity,light_normalization,full_normalization
0,"Sen. Kamala Harris is ""supporting the animals of MS-13.""",0,"sen. kamala harris is ""supporting the animals of ms-13.""",sen kamala harris support animal
1,"Says Ronald Reagan said immigrants ""brought with them courage and the values of family, work, and freedom. Let us pledge to each other that we can make America great again.""",1,"says ronald reagan said immigrants ""brought with them courage and the values of family, work, and freedom. let us pledge to each other that we can make america great again.""",ronald reagan immigrant bring courage value family work freedom let pledge america great
2,"Says Democratic Senators ""demand Supreme Court nominee not be unduly influenced by U.S. Constitution.""",0,"says democratic senators ""demand supreme court nominee not be unduly influenced by u.s. constitution.""",democratic senator demand supreme court nominee unduly influence constitution
3,"""Tim Kaine doesn’t want a border at all. He wants to get rid of Immigration and Customs Enforcement and basically the border patrol as well.""",0,"""tim kaine doesnt want a border at all. he wants to get rid of immigration and customs enforcement and basically the border patrol as well.""",tim kaine want border want rid immigration custom enforcement basically border patrol
4,"""George H.W. Bush has died at 94.""",0,"""george h.w. bush has died at 94.""",george bush die


In [None]:
# Persist the frame (statement, label and both normalized columns) so the
# normalization step does not have to be repeated.
# NOTE(review): the relative path writes to the notebook's working directory, not
# to the Drive folder the data was read from — confirm the file ends up somewhere durable.
df.to_csv('politifact_binarized_light_full_normalization.csv', index=False)