In [49]:
# Importando as bibliotecas que iremos utilizar:
import nltk
import re
import pandas as pd
import string
import os
import json

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('rslp')
nltk.download('wordnet')

#Nome da pasta tema
pasta_tema = "BOT"

#Nome do arquivo
nome_arquivo = "steam_chat_bot"

#Caminho do arquivo
path_arquivo = f"./ChatRooms/{pasta_tema}/{nome_arquivo}"

[nltk_data] Downloading package stopwords to /home/victor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/victor/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package wordnet to /home/victor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [50]:
#Criar um arquivo csv a partir do JSON de dados
with open(f"{path_arquivo}.json", encoding='utf-8-sig') as f_input:
    df = pd.read_json(f_input)

df.to_csv(f"{path_arquivo}.csv", encoding='utf-8', index=False)

In [51]:
# Visualizando os dados:
df = pd.read_csv(f"{path_arquivo}.csv", usecols = ['id','text', 'sent', 'fromUser'], encoding='utf-8')
df

Unnamed: 0,id,text,sent,fromUser
0,55964bdaaf82937012f5e2de,,2015-07-03T08:46:18.845Z,"{'id': '55964ab515522ed4b3e34c59', 'username':..."
1,55964fa4fcbe8872682ed373,,2015-07-03T09:02:28.284Z,"{'id': '55964ab515522ed4b3e34c59', 'username':..."
2,55bbb824a0587bc54d68df1d,I'm having trouble with 3. Edit the config fil...,2015-07-31T18:02:12.916Z,"{'id': '55bb51ff0fc9f982beaba7f7', 'username':..."
3,55bbb82f8deffbc44d8dfb01,can't seem to find the config file anywhere,2015-07-31T18:02:23.528Z,"{'id': '55bb51ff0fc9f982beaba7f7', 'username':..."
4,55bbb8a843481e5337600d4c,,2015-07-31T18:04:24.085Z,"{'id': '55bb51ff0fc9f982beaba7f7', 'username':..."
5,55bbbaa18deffbc44d8dfb83,"I apologize, got it to work",2015-07-31T18:12:49.466Z,"{'id': '55bb51ff0fc9f982beaba7f7', 'username':..."
6,55be2a74dcea60b379fefbc5,\t// Say something when a user joins chat\n\t{...,2015-08-02T14:34:28.192Z,"{'id': '55bb51ff0fc9f982beaba7f7', 'username':..."
7,55be2a8f8deffbc44d8e25a0,"multiple IDs for welcoming doesn't work, any h...",2015-08-02T14:34:55.948Z,"{'id': '55bb51ff0fc9f982beaba7f7', 'username':..."
8,55c6c8d121801cd866ca67da,@kanyawest you could use doormat trigger inste...,2015-08-09T03:28:17.556Z,"{'id': '55964bf415522ed4b3e34c69', 'username':..."
9,55c6c8ff2ee3da6275c33fb3,"also if anyone needs me in the future, just ad...",2015-08-09T03:29:03.235Z,"{'id': '55964bf415522ed4b3e34c69', 'username':..."


In [52]:
# Removendo os valores duplicados:
# df.drop_duplicates(subset=['text'], inplace=True)

#Renomeia o nome da coluna fromUser para username
df = df.rename(columns=({'fromUser':'username'}))

In [53]:
#Remove pontuação
def RemovePunctuation(instancia):
    palavras = []
    table = str.maketrans("", "", string.punctuation)
    for w in instancia.split():
        palavras.append(w.translate(table))
    return (" ".join(palavras))

# Função para remover Stopwords da nossa base:
def RemoveStopWords(instancia):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palavras = [i for i in instancia.split() if not i in stopwords]
    return (" ".join(palavras))

#Stemming é a técnica de remover sufixos e prefixos de uma palavra, chamada stem.
#Por exemplo, o stem da palavra cooking é cook. Um bom algoritmo sabe que “ing” é um sufixo e pode ser removido
def Stemming(instancia):
    stemmer = nltk.stem.RSLPStemmer()
    palavras = []
    for w in instancia.split():
        palavras.append(stemmer.stem(w))
    return (" ".join(palavras))

#Vamos remover as pontuações e os links, pois eles não adiciona nenhuma informação extra.
def Limpeza_dados(instancia):
    #Transforma tudo em string
    instancia = re.sub("[^a-zA-Z]", " ", str(instancia))
    
    #Remove os números 
    instancia = re.sub(r"\d+", "", instancia)
    
    #Remove links, pontos, virgulas,ponto e virgulas dos tweets
    #instancia = re.sub(r"http\S+", "", instancia).lower().replace('.','').replace(';','').replace('-','').replace(':','').replace(')','')
    return (instancia)

#Reduz as palavras flexionadas adequadamente, garantindo que a palavra raiz pertença ao idioma.
wordnet_lemmatizer = WordNetLemmatizer()
def Lemmatization(instancia):
    palavras = []
    for w in instancia.split():
        palavras.append(wordnet_lemmatizer.lemmatize(w))
    return (" ".join(palavras))

In [54]:
#Função principal
def Preprocessing(instancia):
    instancia = Limpeza_dados(instancia)
    palavras = RemoveStopWords(instancia)
    #palavras = Stemming(palavras)
    palavras = Lemmatization(palavras)
    #palavras = RemovePunctuation(palavras)
    
    return palavras

In [55]:
#Pega o json do usuário e retorna somente o username.
def getUserObject(user_string):
    user_array = user_string.split(",")
    user_string = "{" + user_array[1] + "}"
    
    #Se o texto interno possuir aspas dupla, iremos colocar o \"
    #user_string = user_string.replace('"', '\\"')
    
    #Iremos substituir todas as ocorrências de aspas simples por aspas dupla
    user_string = user_string.replace('\'', '"')
    
    if user_string != "":
        #Transforma a string em json
        user = json.loads(user_string)
        username = user['username']
        
    return username


In [56]:
#Função para remover as mensagens adjacentes do mesmo usuário
def concatMessageSameUser(df):
    index_remove = []
    
    #Percorre todas as mensagens
    for i in range(len(df) - 1):
        #Verifica se é o mesmo usuário da mensagem posterior
        if df['username'][i] == df['username'][i + 1]:
            #Concatena a mensagem
            df['text'][i] = str(df['text'][i]) + ' ' + str(df['text'][i + 1])
            
            #Guarda o número da linha para remover
            index_remove.append(i + 1)
        
    #Apaga as linhas
    df.drop(df.index[index_remove], inplace=True)
            
    return df


In [57]:
# Separando username:
username = df['username']

# Busca apenas os nomes dos usuários
df['username'] = [getUserObject(i) for i in username]

# Busca as mensagens adjacentes do mesmo usuário e concatena 
# df = concatMessageSameUser(df)

In [58]:
# Separando username:
messages = df['text']

# Aplica a função em todos os dados:
messages = [Preprocessing(i) for i in messages]

#Insere a coluna clean com as mensagens pré processadas
df.insert(2, "clean", messages)

#Irá remover as rows com valores vazios ou 'nan'
df.replace({'clean' : { '' : float("NaN"), 'nan' : float("NaN")}}, inplace=True)
df.dropna(subset=['clean'])

Unnamed: 0,id,text,clean,sent,username
2,55bbb824a0587bc54d68df1d,I'm having trouble with 3. Edit the config fil...,I trouble Edit config file node module steam c...,2015-07-31T18:02:12.916Z,kanyawest
3,55bbb82f8deffbc44d8dfb01,can't seem to find the config file anywhere,seem find config file anywhere,2015-07-31T18:02:23.528Z,kanyawest
5,55bbbaa18deffbc44d8dfb83,"I apologize, got it to work",I apologize got work,2015-07-31T18:12:49.466Z,kanyawest
6,55be2a74dcea60b379fefbc5,\t// Say something when a user joins chat\n\t{...,Say something user join chat name SteveHoltEnt...,2015-08-02T14:34:28.192Z,kanyawest
7,55be2a8f8deffbc44d8e25a0,"multiple IDs for welcoming doesn't work, any h...",multiple IDs welcoming work help It work one,2015-08-02T14:34:55.948Z,kanyawest
8,55c6c8d121801cd866ca67da,@kanyawest you could use doormat trigger inste...,kanyawest could use doormat trigger instead se...,2015-08-09T03:28:17.556Z,dragonbanshee
9,55c6c8ff2ee3da6275c33fb3,"also if anyone needs me in the future, just ad...",also anyone need future add dragonbanshee mess...,2015-08-09T03:29:03.235Z,dragonbanshee
10,55c6cb5daac97ada66dd2dfd,"Looking closer at it, it isn't coded to suppor...",Looking closer coded support multiple user To ...,2015-08-09T03:39:09.676Z,dragonbanshee
11,55c8b7f78f067d637598c061,"thanks for the reply, it worked",thanks reply worked,2015-08-10T14:40:55.667Z,kanyawest
12,55e3906b8a8b32aa29a30c44,Hello?,Hello,2015-08-30T23:23:23.583Z,Ahmedstien


In [59]:
#Salva o csv novamente com os dados formatados
df.to_csv(f"{path_arquivo}_threads_pre_processado.csv", encoding='utf-8', index=False)

In [60]:
#Mover os arquivos para uma pasta específica
if not os.path.exists(f"{path_arquivo}"):
    os.makedirs(f"{path_arquivo}")
    
os.replace(f"{path_arquivo}.csv", f"{path_arquivo}/{nome_arquivo}.csv")
os.replace(f"{path_arquivo}_threads_pre_processado.csv", f"{path_arquivo}/{nome_arquivo}_threads_pre_processado.csv")