## Experiments

Here you can organize all the experiments and exploration as you figure out how to collect and analyze your data and build your NLP tool. The experiments you conduct here will contribute to the report/presentation of your project.

Once you've finalized everything, you should then transfer the parts that are necessary for your demo to the code in the `nlp` folder.

In [1]:
# configure matplotlib to print pretty figures 
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')
plt.rcParams['savefig.dpi'] = 75

plt.rcParams['figure.autolayout'] = False
plt.rcParams['figure.figsize'] = 10, 6
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 14

plt.rcParams['text.usetex'] = True
plt.rcParams['font.family'] = "serif"
plt.rcParams['font.serif'] = "cm"

  set_matplotlib_formats('pdf', 'png')


In [1]:
import numpy as np
import pandas as pd

true = pd.read_csv('true.csv')
fake = pd.read_csv('fake.csv')

true['label'] = 1
fake['label'] = 0

df_news = pd.concat([fake, true])
df_news = df_news.sample(frac=1)
df_news.reset_index(drop=True, inplace=True)

In [2]:
import spacy
import re
import pycountry
from sklearn.model_selection import train_test_split

# spacy.require_gpu()
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

train_data_news, test_data_news, label_train_data_news, label_test_data_news = train_test_split(
    df_news.iloc[:, :-1], 
    df_news.iloc[:, -1:], 
    test_size=0.2, 
    stratify=df_news.iloc[:, -1:])

countries = [country.name for country in pycountry.countries]
# Generate a list of country names

def replace_countries(text):
        for country in countries:
            text = text.replace(country, "country")
        return text

def process_text_batch(docs):
    texts = [replace_countries(doc.replace('\xa0', ' ')) for doc in docs]
    docs = list(nlp.pipe(texts))
    return [[token.lemma_ for token in doc if not token.is_stop] for doc in docs]

def parallel_process_text(data, batch_size=1000):
    processed_data = []
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        processed_batch = process_text_batch(batch)
        processed_data.extend(processed_batch)
    return processed_data

X_train_text = parallel_process_text(train_data_news['text'].tolist())
X_test_text = parallel_process_text(test_data_news['text'].tolist())

X_train_text_join = [' '.join(doc) for doc in X_train_text]
X_test_text_join = [' '.join(doc) for doc in X_test_text]


In [57]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train_text_join)
X_train_text_seq = tokenizer.texts_to_sequences(X_train_text_join)
X_test_text_seq = tokenizer.texts_to_sequences(X_test_text_join)

In [58]:
import pickle

# Serialize the tokenizer to a file
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [50]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils import pad_sequences

X_train_text_sq = pad_sequences(X_train_text_seq, maxlen=150)
X_test_text_sq = pad_sequences(X_test_text_seq, maxlen=150)

model_text = keras.Sequential()
model_text.add(layers.Embedding(10000, 128))
model_text.add(layers.GRU(units=96, dropout=0.4, return_sequences=True))
model_text.add(layers.GlobalMaxPooling1D())
model_text.add(layers.Dense(units=96, activation='elu'))
model_text.add(layers.Dropout(rate=0.2))
model_text.add(layers.Dense(units=128, activation='elu'))
model_text.add(layers.Dropout(rate=0.2))
model_text.add(layers.Dense(units=48, activation='elu'))
model_text.add(layers.Dropout(rate=0.4))
model_text.add(layers.Dense(units=96, activation='elu'))
model_text.add(layers.Dropout(rate=0.2))
model_text.add(layers.Dense(units=112, activation='elu'))
model_text.add(layers.Dropout(rate=0.4))
model_text.add(layers.Dense(units=96, activation='elu'))
model_text.add(layers.Dropout(rate=0.2))
model_text.add(layers.Dense(units=32, activation='elu'))
model_text.add(layers.Dropout(rate=0.4))
model_text.add(layers.Dense(units=96, activation='elu'))
model_text.add(layers.Dropout(rate=0.4))
model_text.add(layers.Dense(units=32, activation='relu'))
model_text.add(layers.Dropout(rate=0.3))
model_text.add(layers.Dense(1, activation='sigmoid'))

model_text.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [51]:
model_text.fit(X_train_text_sq, label_train_data_news, epochs=4, validation_data=(X_test_text_sq, label_test_data_news))

Epoch 1/4
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.8640 - loss: 0.2513 - val_accuracy: 0.9898 - val_loss: 0.0371
Epoch 2/4
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 27ms/step - accuracy: 0.9912 - loss: 0.0313 - val_accuracy: 0.9883 - val_loss: 0.0327
Epoch 3/4
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 25ms/step - accuracy: 0.9948 - loss: 0.0208 - val_accuracy: 0.9836 - val_loss: 0.0605
Epoch 4/4
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 27ms/step - accuracy: 0.9975 - loss: 0.0117 - val_accuracy: 0.9940 - val_loss: 0.0353


<keras.src.callbacks.history.History at 0x2d27ba83e00>

In [17]:
import pickle
model = pickle.load(open("../nlp/model.pkl", 'rb'))

  trackable.load_own_variables(weights_store.get(inner_path))


ValueError: A total of 1 objects could not be loaded. Example error message for object <GRUCell name=gru_cell, built=True>:

Layer 'gru_cell' expected 3 variables, but received 0 variables during loading. Expected: ['kernel', 'recurrent_kernel', 'bias']

List of objects that could not be loaded:
[<GRUCell name=gru_cell, built=True>]

In [52]:
# Save the model using the SavedModel format
model_text.save('my_model.keras')

# Load the model
model = tf.keras.models.load_model('my_model.keras')


In [53]:
score, acc = model_text.evaluate(X_test_text_sq, label_test_data_news,
                              batch_size=128)

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9947 - loss: 0.0301


In [54]:
display(score,acc)

0.03527630493044853

0.9939866662025452

In [60]:
from IPython.display import display, HTML

def process_single_text(doc):
	text = doc.replace('\xa0', ' ')    	
	new_text = replace_countries(text)
	nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
       
	processed_doc = nlp(new_text)
	
	lemmatized_tokens = [token.lemma_ for token in processed_doc if not token.is_stop and not token.is_punct]
	
	clean_text = ' '.join(lemmatized_tokens)

	return clean_text

from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_proba(texts):
    # Preprocess texts (cleaning, lemmatizing)
    processed_texts = texts
    # Tokenize and pad sequences
    sequences = tokenizer.texts_to_sequences(processed_texts)
    padded_sequences = pad_sequences(sequences, maxlen=150)
    # Predict and format for LIME
    predictions = model_text.predict(padded_sequences)
    return np.hstack((1-predictions, predictions))

new_article = "Hong Kong has seen several months of pro-democracy protests - and China appears to be tightening its grip. The protests began in June 2019 over plans - later put on ice, and finally withdrawn in September - that would have allowed extradition from Hong Kong to mainland China. They then spread to reflect wider demands for democratic reform, and an inquiry into alleged police brutality. Now, China is proposing to introduce a new national security law, which critics believe could be used to crack down on rights and political activists. This is not all happening in a vacuum. There's a lot of important context - some of it stretching back decades - that helps explain what is going on. It's important to remember that Hong Kong is significantly different from other Chinese cities. To understand this, you need to look at its history. It was a British colony for more than 150 years - part of it, Hong Kong island, was ceded to the UK after a war in 1842. Later, China also leased the rest of Hong Kong - the New Territories - to the British for 99 years. It became a busy trading port, and its economy took off in the 1950s as it became a manufacturing hub. The territory was also popular with migrants and dissidents fleeing instability, poverty or persecution in mainland China. Then, in the early 1980s, as the deadline for the 99-year-lease approached, Britain and China began talks on the future of Hong Kong - with the communist government in China arguing that all of Hong Kong should be returned to Chinese rule. The two sides signed a treaty in 1984 that would see Hong Kong return to China in 1997, under the principle of 'one country, two systems'. This meant that while becoming part of one country with China, Hong Kong would enjoy 'a high degree of autonomy, except in foreign and defence affairs' for 50 years. As a result, Hong Kong has its own legal system and borders, and rights including freedom of assembly, free speech and freedom of the press are protected.For example, it is one of the few places in Chinese territory where people can commemorate the 1989 Tiananmen Square crackdown, where the military opened fire on unarmed protesters in Beijing. Hong Kong still enjoys freedoms not seen in mainland China - but they are widely thought to be on the decline. Rights groups have accused China of meddling in Hong Kong, citing examples such as legal rulings that have disqualified pro-democracy legislators, and the disappearance of five Hong Kong booksellers, and a tycoon - who all eventually re-emerged in custody in China. There are also accusations that press and academic freedoms have been deteriorating. In March, China effectively expelled several US journalists - but also prohibited them from working in Hong Kong. The public broadcaster RTHK has come under pressure from Hong Kong's government, first for broadcasting an interview with the World Health Organization about Taiwan, and then for targeting police in its satirical news show 'Headliner'. The local examinations body also came under fire for a world history question about relations between Japan and China, with the government demanding the exam question be invalidated. The government said it was a professional, rather than political, decision, but many academics expressed concern. Another sticking point has been democratic reform. Hong Kong's leader, the chief executive, is currently elected by a 1,200-member election committee - a mostly pro-Beijing body chosen by just 6% of eligible voters. Not all the 70 members of the territory's lawmaking body, the Legislative Council, are directly chosen by Hong Kong's voters. Most seats not directly elected are occupied by pro-Beijing lawmakers. In June 2019, protesters took to the streets again, demonstrating against plans to allow extraditions to mainland China. This time, clashes between police and activists became increasingly violent. The bill was halted, and later fully withdrawn, but demonstrations continued for months, with demands for full democracy and an independent inquiry into police actions. In April this year, Hong Kong police arrested 15 of the city's most high-profile pro-democracy activists for taking part in unauthorised assemblies. In May, Hong Kong's police watchdog said it found no significant wrongdoing on the police's part during the 2019 protests - in a report that was criticised by many rights groups and external experts. The street protests have mostly died down during the coronavirus pandemic, although some small demonstrations, including singing protesters in shopping malls, have started again as restrictions are gradually eased. Now, China is proposing to introduce a new national security law in Hong Kong, which could be similar to the one withdrawn in 2003. It says the legislation is 'highly necessary' and would 'safeguard national security in Hong Kong'. However, the new proposal is also controversial because it is expected to circumvent Hong Kong's own law-making processes - leading to accusations that Beijing is undermining Hong Kong's autonomy."

prediction = model_text.predict(x=process_single_text(new_article))

from lime.lime_text import LimeTextExplainer

# Initialize the explainer
explainer = LimeTextExplainer(class_names=["False", "True"])

# Generate explanation
exp = explainer.explain_instance(process_single_text(new_article), predict_proba, num_features=10)

# Display the explanation
# print(exp.as_list())
# exp.show_in_notebook(text=True)

# exp_html = exp.as_html()

# # Define CSS styles to enhance visibility on dark themes
# style = '''
# <style>
#     body, p, li { color: #fff; background-color: white; } /* Adjust text and background colors */
#     .highlight { color: red; } /* Example of custom class adjustments */
# </style>
# '''
# # Concatenate the style with the explanation HTML
# html_output = style + exp_html

# # Display modified HTML in a Jupyter notebook cell or an IPython environment
# display(HTML(html_output))

ValueError: Unrecognized data type: x=country see month pro democracy protest country appear tighten grip protest begin June 2019 plan later ice finally withdraw September allow extradition country mainland country spread reflect wide demand democratic reform inquiry alleged police brutality country propose introduce new national security law critic believe crack right political activist happen vacuum lot important context stretch decade help explain go important remember country significantly different chinese city understand need look history british colony 150 year country island cede UK war 1842 later country lease rest country New Territories British 99 year busy trading port economy take 1950 manufacturing hub territory popular migrant dissident flee instability poverty persecution mainland country early 1980 deadline 99 year lease approach Britain country begin talk future country communist government country argue country return chinese rule side sign treaty 1984 country return country 1997 principle country system mean country country country enjoy high degree autonomy foreign defence affair 50 year result country legal system border right include freedom assembly free speech freedom press protect example place chinese territory people commemorate 1989 Tiananmen Square crackdown military open fire unarmed protester Beijing country enjoy freedom see mainland country widely think decline right group accuse country meddle country cite example legal ruling disqualify pro democracy legislator disappearance country bookseller tycoon eventually emerge custody country accusation press academic freedom deteriorate March country effectively expel journalist prohibit work country public broadcaster RTHK come pressure country government broadcast interview World Health Organization Taiwan target police satirical news Headliner local examination body come fire world history question relation country country government demand exam question invalidated government say professional political decision academic express concern stick point democratic reform country leader chief executive currently elect 1,200 member election committee pro beijing body choose 6 eligible voter 70 member territory lawmaking body Legislative Council directly choose country voter seat directly elect occupy pro beijing lawmaker June 2019 protester take street demonstrate plan allow extradition mainland country time clash police activist increasingly violent bill halt later fully withdraw demonstration continue month demand democracy independent inquiry police action April year country police arrest 15 city high profile pro democracy activist take unauthorised assembly country police watchdog say find significant wrongdoing police 2019 protest report criticise right group external expert street protest die coronavirus pandemic small demonstration include singe protester shopping mall start restriction gradually ease country propose introduce new national security law country similar withdraw 2003 say legislation highly necessary safeguard national security country new proposal controversial expect circumvent country law make process lead accusation Beijing undermine country autonomy (of type <class 'str'>)

In [66]:
fake[fake["text"].str.contains("Hong Kong")]

Unnamed: 0,title,text,subject,date,label
585,Trump’s Rhetoric Just Resulted In A Trillion ...,If Donald Trump wants to take credit for somet...,News,"August 11, 2017",0
3879,Stock Market Literally SHUTS DOWN And Dollar ...,The prospect of Donald Trump winning the presi...,News,"November 8, 2016",0
4380,BUSTED: Donald Trump Screwed Over American St...,Donald Trump claims he will make America grea...,News,"October 3, 2016",0
6758,New Study Offers Hope In Search For Alzheimer...,Alzheimer s Disease is a devastating type of d...,News,"April 22, 2016",0
9083,The DIRTY TRUTH About DACA Recipients…Where Th...,"Yesterday, a second U.S. judge on Tuesday bloc...",politics,15-Feb-18,0
11097,COLLEGE PROFESSOR’S SEVERED TRUMP HEAD Paintin...,A painting on display at a University of Alask...,politics,"Apr 20, 2017",0
13905,HILLARY APPROVED? BILL CLINTON Ditched Secret ...,We all know Bill Clinton is a sexual predator....,politics,"May 13, 2016",0
15477,CHINESE IMMIGRANT OWES MILLIONS FOR SELLING CO...,Counterfeiting is illegal by the way and shoul...,politics,"Jul 11, 2015",0
16866,Want To Know Where Your Meat Comes From? Those...,One word China That s the only reason you shou...,Government News,"Jan 5, 2016",0
17201,CHINESE IMMIGRANT OWES MILLIONS FOR SELLING CO...,Counterfeiting is illegal by the way and shoul...,Government News,"Jul 11, 2015",0


In [7]:
true[true['text'].str.contains("Hong Kong")]["text"]

569      WASHINGTON (Reuters) - The United States has c...
769      WASHINGTON (Reuters) - U.S. Commerce Secretary...
795      WASHINGTON (Reuters) - U.S. Commerce Secretary...
821      WASHINGTON/KHOBAR, Saudi Arabia (Reuters) - U....
1533     HONG KONG (Reuters) - U.S. Commerce Secretary ...
                               ...                        
21176    BANGKOK (Reuters) - Thailand s prestigious Chu...
21184    HANOI (Reuters) - Tensions are high on the Sou...
21257    HONG KONG (Reuters) - Hong Kong businessman an...
21394    HONG KONG (Reuters) - Typhoon Hato, a maximum ...
21398    SHANGHAI (Reuters) - An old review of an acade...
Name: text, Length: 110, dtype: object