# Invite people for the party

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
import dataframe_image as dfi
# dfi.export(df, 'dataframe.png')

# Import data

In [3]:
df = pd.read_csv(r"go_processed_text.csv")

In [6]:
df = df[['datetime','tweet_id','text']]

# Fix imported data

In [12]:
df['tokenized'] = df.apply(lambda row: eval(row['tokenized']), axis=1)
df['stemmed'] = df.apply(lambda row: eval(row['tokenized']), axis=1)
df['lemmatized'] = df.apply(lambda row: eval(row['tokenized']), axis=1)
df['processed_text'] = df.apply(lambda row: eval(row['tokenized']), axis=1)

# Importação do NLTK

In [1]:
#import libraries
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wferreira.MPAC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Cleaning

- remoção de caracteres indesejados.

<blockquote><i>Many social media data consists of unstructured text 
data. Unstructured text data contains non-significant 
expressions. This "dirty" data needs to be cleaned for NLP 
work to be done effectively. In addition, non-English 
characters should be cleaned in tweet texts to carry out NLP 
work on English texts. Agrali e Aydin (2021).</i></blockquote>

In [8]:
import re
from bs4 import BeautifulSoup

def removeLinks(instancia):
    return re.sub(r"http\S+", "", instancia).lower()

def removePunctuation(instancia):
    return instancia.replace('.','') \
        .replace(',','') \
        .replace(';','') \
        .replace('-','') \
        .replace(':','') \
        .replace('(','') \
        .replace(')','')

def removeHTMLTags(instancia):
    return BeautifulSoup(instancia, 'html.parser').get_text()

def removeNonLettersAndNumbers(instancia):
    return re.sub(r"[^a-zA-Zà-úÀ-Ú0-9 ]", "", instancia.lower())

def removeSpacesFromCorners(instancia):
    return instancia.strip(" ").strip()

def cleanText(instancia):
    instancia = removeLinks(instancia)
    instancia = removePunctuation(instancia)
    instancia = removeHTMLTags(instancia)
    instancia = removeNonLettersAndNumbers(instancia)
    instancia = removeSpacesFromCorners(instancia)
    return instancia

# Tokenization

- separa o texto por palavras

<blockquote><i>Tokenization is one of the first text normalization 
operations to be implemented. The purpose of this step is to 
divide the text or paragraphs into smaller sections. In this way, 
more accurate transactions and analyzes are made. 
Tokenization can be used in two different ways, either wordbased or sentence based. In this study, a word-based 
tokenization process will be applied. Tokenization has been 
applied to the texts in the "clean_text" column of the dataset. 
The words in the text for each line formed the new "tokenized" 
column as a list. Agrali e Aydin (2021).<i></blockquote>

In [9]:
def tokenizeText(instancia):
    '''
    Recebe o texto e retorna uma lista de tokens.
    '''
    return nltk.tokenize.TweetTokenizer().tokenize(instancia)

# Stemming

- É uma técnica de reduzir uma palavra ao seu radical, mesmo que seu radical não seja válido no seu idioma, removendo prefixos e sufixos de uma palavra.

<blockquote><i>Stemming is applied to remove the inflections (prefix or 
suffix) of words. Words that have the same meaning and 
spelling are evaluated as different words by taking a prefix or 
a suffix. Stemming process is used to prevent this. After the 
tokenization process, the words kept as a list will be converted 
into root words. Agrali e Aydin (2021).</i></blockquote>

In [10]:
nltk.download('rslp')
def stemText(instancia):
    '''
    Recebe uma lista de termos e retorna uma lista stemada.
    '''
    stemmer = nltk.stem.RSLPStemmer()
    words = []
    for w in instancia:
        words.append(stemmer.stem(w))
    return words

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\wferreira.MPAC\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


# Lemmatization

- <p>Reduz as palavras flexionadas adequadamente, garantindo que a palavra raiz pertença ao idioma. Determinando assim a palavra que representa seu lema.<p>
- <p>Por exemplo: <strong>"execuções"</strong> e <strong>"executar"</strong> são formas da palavra <strong>execução</strong>, portanto <strong>execução</strong> é o lema da palavra.</p>

<blockquote><i>Lemmatization is appliedas stemming can sometimes fail 
to find root words. Lemmatization, which considers the 
morphological analysis of words and appropriately separates 
the meaningful word into its roots, can be used as an 
alternative. Lemmatization is a very important method to find 
the smallest root form of a word. In this way, each word 
becomes able to represent itself. Agrali e Aydin (2021).</i></blockquote>

In [2]:
from nltk.stem import WordNetLemmatizer
lematizador = WordNetLemmatizer()

def lemmatizeText(instancia):
    '''
    Recebe uma lista de termos e retorna uma lista lematizada.
    '''
    words = []
    for word in instancia:
        words.append(lematizador.lemmatize(word))
    return words

In [22]:
nltk.WordNetLemmatizer().lemmatize('metavers')

'metavers'

In [17]:
lematizador.lemmatize('haters')

'hater'

# Removing Stopwords

- Palavras que não são relevantes na sentença

<blockquote><i>Remove stopwordsstep is applied afterwards. While 
creating sentences, words that do not mean anything in terms 
of emotions and meanings are used. These words are the most 
common words in a language (such as “the”, “a”, “in”), which 
are usually helpful in sentence construction. These words are 
called stopwords. There are different stopwords for each 
language. In this study, English stopwords are discussed. 
Removing these words from the sentence will not have any 
effect in terms of sentiment analysis in the sentence. Agrali e Aydin (2021).</i></blockquote>

In [12]:
def removeStopWords(instancia):
    '''
    Recebe uma lista de termos e retorna uma lista
    com stopwords removidas.
    '''
    stopwords = set(nltk.corpus.stopwords.words('english'))
    words = [i for i in instancia if not i in stopwords]
    return " ".join(words)

# Testing Functions

In [13]:
tqdm.pandas()

In [14]:
def save(filename, data):    
    data.to_csv('{}.csv'.format(filename), index=False)

In [18]:
def preprocessingText(data, filename):
    data['clean_text'] = data.progress_apply(lambda row: cleanText(row['text']), axis=1)
    data['tokenized'] = data.progress_apply(lambda row: tokenizeText(row['clean_text']), axis=1)
    data['stemmed'] = data.progress_apply(lambda row: stemText(row['tokenized']), axis=1)
    data['lemmatized'] = data.progress_apply(lambda row: lemmatizeText(row['stemmed']), axis=1)
    data['processed_text'] = data.progress_apply(lambda row: removeStopWords(row['lemmatized']), axis=1)
    return data

In [19]:
df = preprocessingText(df, 'withemoji')

100%|██████████| 3667775/3667775 [05:46<00:00, 10590.49it/s]
100%|██████████| 3667775/3667775 [12:15<00:00, 4984.38it/s]
100%|██████████| 3667775/3667775 [2:02:16<00:00, 499.94it/s]   
100%|██████████| 3667775/3667775 [56:15<00:00, 1086.68it/s]  
100%|██████████| 3667775/3667775 [57:47<00:00, 1057.83it/s]  


In [30]:
df.head()

Unnamed: 0,datetime,tweet_id,text,clean_text,tokenized,stemmed,lemmatized,processed_text
0,2021-10-01 00:12:02+00:00,1443730389986590720,✨Join me this Sunday to celebrate 6 months sin...,join me this sunday to celebrate 6 months sinc...,"[join, me, this, sunday, to, celebrate, 6, mon...","[join, me, thil, sunday, to, celebrat, 6, mont...","[join, me, thil, sunday, to, celebrat, 6, mont...",join thil sunday celebrat 6 month sinc genesil...
1,2021-10-01 00:13:00+00:00,1443730633360953344,@top7ico @MXCfoundation @DeFinePlatform @YOPfi...,top7ico mxcfoundation defineplatform yopfi o3s...,"[top, 7ico, mxcfoundation, defineplatform, yop...","[top, 7ic, mxcfoundation, defineplatform, yopf...","[top, 7ic, mxcfoundation, defineplatform, yopf...",top 7ic mxcfoundation defineplatform yopf o3sw...
2,2021-10-01 00:13:09+00:00,1443730669419446291,@MetaSpatial_io Good project go to the moon 🚀🚀...,metaspatialio good project go to the moon muha...,"[metaspatialio, good, project, go, to, the, mo...","[metaspatiali, good, project, go, to, the, moo...","[metaspatiali, good, project, go, to, the, moo...",metaspatiali good project go moon muhamad 8065...
3,2021-10-01 00:13:18+00:00,1443730706409033728,@Metaverse_Yin I just take the hype as alpha lol,metaverseyin i just take the hype as alpha lol,"[metaverseyin, i, just, take, the, hype, as, a...","[metaverseyin, i, just, tak, the, hyp, as, alp...","[metaverseyin, i, just, tak, the, hyp, a, alph...",metaverseyin tak hyp alph lol
4,2021-10-01 00:13:56+00:00,1443730867797565442,@Metaverse_Yin Keep it coming ! Lmao,metaverseyin keep it coming lmao,"[metaverseyin, keep, it, coming, lmao]","[metaverseyin, keep, it, coming, lma]","[metaverseyin, keep, it, coming, lma]",metaverseyin keep coming lma


# Counting words

In [5]:
# import CountVectorizer for counting number of times each word occurs
from sklearn.feature_extraction.text import CountVectorizer

#### CountVectorizer module convert a collection of text documents to a matrix of token counts.
<blockquote><i>If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will be equal to the vocabulary size found by analyzing the data. Pedregosa et al. (2011). scikit-learn.org</i></blockquote>

https://scikit-learn.org/stable/about.html#citing-scikit-learn

## TODO: DELETE NaN values in the processed_text

In [15]:
# Delete some missing values in the collection
df = df[df.processed_text.isnull() == False]

In [16]:
# Create a matrix which shows the number of times specific terms appear on column text
cv = CountVectorizer()
count_matrix = cv.fit_transform(df.processed_text)

In [42]:
# get the terms
#cv.get_feature_names()

### Bag of words

In [None]:
# array of frequency of terms in each document
count_matrix.toarray()

In [17]:
# create dataframe
word_count = pd.DataFrame(cv.get_feature_names(), columns = ['term'])
# sum the presence of terms and turn it into a list
word_count["count"] = count_matrix.sum(axis=0).tolist()[0]
word_count = word_count.sort_values("count", ascending=False).reset_index(drop=True)
# 200 palavras mais usadas
word_count[:]



Unnamed: 0,term,count
0,metavers,3570138
1,nft,1104533
2,thil,734549
3,project,573580
4,crypt,282796
...,...,...
2007954,faysalnafim,1
2007955,faysalkhan,1
2007956,faysalhossain,1
2007957,faysalchaudary,1


In [77]:
type(count_matrix), count_matrix.shape

(scipy.sparse._csr.csr_matrix, (3667775, 2007959))

In [78]:
# importing pandas library
import pandas as pd
# importing matplotlib library
import matplotlib.pyplot as plt

In [22]:
word_count[word_count['count'] > 100000]

Unnamed: 0,term,count
0,metavers,3570138
1,nft,1104533
2,thil,734549
3,project,573580
4,crypt,282796
5,gam,274057
6,hav,256039
7,new,255632
8,mor,210418
9,lik,199900
