# Part 1: Text Processing
### 29/10/21

#### Vladimir Trukhaev & Ingrid Sancho

In [1]:
#imports 
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import json

In [2]:
#updating/downloading stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#reading data
doc = "dataset_tweets_WHO.txt"
with open(doc, 'r') as file:
    data = json.load(file)
    
#initializing dictionary "my_dict" where value is the tweet text and key its id
keylist = []
for key in data:
    keylist.append(key)


my_dict = {}
    
for i in keylist:
    my_dict[i] = None
    
for key in data:
    tweet = []
    for i in data[key]["full_text"]:
        tweet.append(i)
    tweet1 = "".join(tweet)
    my_dict[key] = tweet1

In [4]:
def lowering(d):
    """
    Transforming tweet text (values in dictionary) in lowercase
    
    Argument:
    d -- dictionary where tweets are stored as values
    
    Returns:
    d -- dictionary with tweets transformed to lowercase as values
    """
    for key in d:
        d[key] = d[key].lower()
    return d


In [5]:
def cleaning(d):
    """
    Removing anything that is not alphanumeric
    
    Argument:
    d -- dictionary where tweets are stored as values
    
    Returns:
    d -- dictionary with tweets without any non alphanumeric character
    """
    for key in d:
        d[key] = ["".join(re.sub(r'[^A-Za-z0-9 #]', ' ', i) for i in d[key])]
    return d

In [6]:
def tokenize(d):
    """
    Tokenizing the tweets, in other words, splitting text by "words"
    
    Argument:
    d -- dictionary where tweets are stored as values
    
    Returns:
    d -- dictionary with lists of words as values
    """
    for key in d:
        for sentence in d[key]:
            d[key] = sentence.split()
    return d

In [7]:
#removing stop words
def stpwords(d):
    """
    Removing stopwords, which are very common words that do not contain meaning
    
    Argument:
    d -- dictionary where list of words of tweets are stored as values
    
    Returns:
    d -- dictionary with lists of words as values, now with no stopwords
    """
    stop_words = set(stopwords.words("english"))
    for key in my_dict:
        my_dict[key] = [word for word in my_dict[key] if word not in stop_words]
    stop_words = set(stopwords.words("spanish"))
    for key in my_dict:
        my_dict[key] = [word for word in my_dict[key] if word not in stop_words]
    stop_words = set(stopwords.words("french"))
    for key in my_dict:
        my_dict[key] = [word for word in my_dict[key] if word not in stop_words]
    return d

In [8]:
#stemming tweets
def stemming(d):
    """
    Stemming tweets, which means to keep only the "root" of each word
    
    Argument:
    d -- dictionary where list of words of tweets are stored as values
    
    Returns:
    d -- dictionary with lists of words as values, now stemmed words
    """
    stemmer = PorterStemmer()
    for key in my_dict.keys():
        my_dict[key] = [stemmer.stem(word) for word in my_dict[key]]
    return d

In [9]:
#running every function for our dictionary of tweets "my_dict"
my_dict = lowering(my_dict)
my_dict = cleaning(my_dict)
my_dict = tokenize(my_dict)
my_dict = stpwords(my_dict)
my_dict = stemming(my_dict)

In [10]:
print(my_dict)

{'0': ['intern', 'day', 'disast', 'risk', 'reduct', '#openwho', 'launch', 'multi', 'tier', 'core', 'curriculum', 'help', 'equip', 'compet', 'need', 'work', 'within', 'public', 'health', 'emerg', 'respons', 'start', 'learn', 'today', 'amp', '#ready4respons', 'http', 'co', 'hbffof0xkl', 'http', 'co', 'fgzy22rwu'], '1': ['#covid19', 'shown', 'health', 'emerg', 'disast', 'affect', 'entir', 'commun', 'especi', 'weak', 'health', 'system', 'vulner', 'popul', 'like', 'migrant', 'indigen', 'peopl', 'live', 'fragil', 'humanitarian', 'condit', 'http', 'co', 'jpuqpnu0v1'], '2': ['intern', 'day', 'disast', 'risk', 'reduct', 'better', 'respond', 'emerg', 'countri', 'must', 'invest', 'health', 'care', 'system', 'achiev', 'gender', 'equiti', 'protect', 'marginalis', 'group', 'ensur', 'readi', 'amp', 'equit', 'access', 'suppli', 'strong', 'amp', 'resili', 'health', 'system', 'http', 'co', '5nalyjiymp'], '3': ['rt', 'whoafro', 'congratul', 'algeria', '#algeria', '16th', 'countri', '#africa', 'reach', 'm