In [1]:
import json
import string
import random

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/devdev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/devdev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# used a dictionary to represent an intents JSON file
data = {
    "intents": [
        {
            "tag": "greeting",
            "patterns": [
                "Hello",
                "How are you?",
                "Hi there",
                "Hi",
                "Whats up"],
            "responses": [
                "Howdy Partner!",
                "Hello",
                "How are you doing?",
                "Greetings!",
                "How do you do?",
            ],
        },
        {
            "tag": "age",
            "patterns": [
                "how old are you?",
                "when is your birthday?",
                "when was you born?",
            ],
            "responses": [
                "I am 24 years old",
                "I was born in 1996",
                "My birthday is July 3rd and I was born in 1996",
                "03/07/1996",
            ],
        },
        {
            "tag": "date",
            "patterns": [
                "what are you doing this weekend?",
                "do you want to hang out some time?",
                "what are your plans for this week",
            ],
            "responses": [
                "I am available all week",
                "I don't have any plans",
                "I am not busy",
            ],
        },
        {
            "tag": "name",
            "patterns": [
                "what's your name?",
                "what are you called?",
                "who are you?"],
            "responses": ["My name is Kippi", "I'm Kippi", "Kippi"],
        },
        {
            "tag": "goodbye",
            "patterns": ["bye", "g2g", "see ya", "adios", "cya"],
            "responses": [
                "It was nice speaking to you",
                "See you later",
                "Speak soon!",
            ],
        },
    ]
}

In [3]:
# initializing lemmatizer to get stem of words
lemmatizer = WordNetLemmatizer()
# Each list to create
words = []
classes = []
doc_X = []
doc_y = []
# Loop through all the intents
# tokenize each pattern and append tokens to words, the patterns and
# the associated tag to their associated list
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        tokens = nltk.word_tokenize(pattern)
        words.extend(tokens)
        doc_X.append(pattern)
        doc_y.append(intent["tag"])

    # add the tag to the classes if it's not there already
    if intent["tag"] not in classes:
        classes.append(intent["tag"])
# lemmatize all the words in the vocab and convert them to lowercase
# if the words don't appear in punctuation
words = [
    lemmatizer.lemmatize(word.lower())
    for word in words
    if word not in string.punctuation
]
# sorting the vocab and classes in alphabetical order and taking the
# # set to ensure no duplicates occur
words = sorted(set(words))
classes = sorted(set(classes))

In [4]:
# list for training data
training = []
out_empty = [0] * len(classes)
# creating the bag of words model
for idx, doc in enumerate(doc_X):
    bow = []
    text = lemmatizer.lemmatize(doc.lower())
    for word in words:
        bow.append(1) if word in text else bow.append(0)
    # mark the index of class that the current pattern is associated
    # to
    output_row = list(out_empty)
    output_row[classes.index(doc_y[idx])] = 1
    # add the one hot encoded BoW and associated classes to training 
    training.append([bow, output_row])
# shuffle the data and convert it to an array
random.shuffle(training)
training = np.array(training, dtype=object)
# split the features and target labels
train_X = np.array(list(training[:, 0]))
train_y = np.array(list(training[:, 1]))

# defining some parameters
input_shape = (len(train_X[0]),)
output_shape = len(train_y[0])
epochs = 200
# the deep learning model
model = Sequential()
model.add(Dense(128, input_shape=input_shape, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(output_shape, activation = "softmax"))
adam = tf.keras.optimizers.Adam(learning_rate=0.01, decay=1e-6)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=["accuracy"])
print(model.summary())
model.fit(x=train_X, y=train_y, epochs=200, verbose=1)

2021-09-09 12:20:40.131676: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-09 12:20:40.133004: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2021-09-09 12:20:40.269726: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-09-09 12:20:40.290619: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2699905000 Hz


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               5120      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 325       
Total params: 13,701
Trainable params: 13,701
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch

Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x7f3d1c68b4f0>

In [5]:
# defining some parameters
input_shape = (len(train_X[0]),)
output_shape = len(train_y[0])
epochs = 200
# the deep learning model
model = Sequential()
model.add(Dense(128, input_shape=input_shape, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(output_shape, activation = "softmax"))
adam = tf.keras.optimizers.Adam(learning_rate=0.01, decay=1e-6)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=["accuracy"])
print(model.summary())
model.fit(x=train_X, y=train_y, epochs=200, verbose=1)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 128)               5120      
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 325       
Total params: 13,701
Trainable params: 13,701
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epo

Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x7f3d1c03dc70>

In [6]:
def clean_text(text): 
  tokens = nltk.word_tokenize(text)
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
  return tokens

def bag_of_words(text, vocab): 
  tokens = clean_text(text)
  bow = [0] * len(vocab)
  for w in tokens: 
    for idx, word in enumerate(vocab):
      if word == w: 
        bow[idx] = 1
  return np.array(bow)

def pred_class(text, vocab, labels): 
  bow = bag_of_words(text, vocab)
  result = model.predict(np.array([bow]))[0]
  thresh = 0.2
  y_pred = [[idx, res] for idx, res in enumerate(result) if res > thresh]

  y_pred.sort(key=lambda x: x[1], reverse=True)
  return_list = []
  for r in y_pred:
    return_list.append(labels[r[0]])
  return return_list

def get_response(intents_list, intents_json): 
  tag = intents_list[0]
  list_of_intents = intents_json["intents"]
  for i in list_of_intents: 
    if i["tag"] == tag:
      result = random.choice(i["responses"])
      break
  return result

In [None]:
# running the chatbot
while True:
    message = input("")
    intents = pred_class(message, words, classes)
    result = get_response(intents, data)
    print(result)

"Hello!"
Howdy Partner!
How are you?
I'm Kippi
Do you like Saulo?
How are you doing?
I'm fine, how about you?
Greetings!
We still have work to do I see
Speak soon!


In [5]:
json_files = json.loads("./resources/pai_careca.json")

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [6]:
!pwd

/home/devdev/Algorithms/Diversao/basic_chatbot/basic_chatbot


In [21]:
with open("/home/devdev/Algorithms/Diversao/basic_chatbot/basic_chatbot/resources/result.json") as f:
    json_text = f.read()


'{\n "about": "Aqui estão os dados que você requisitou. Lembre-se: o Telegram não tem anúncios, não usamos seus dados para oferecer anúncios, não os vendemos para outros, e nem somos parte de nenhuma \\"'

In [22]:
json_text[:500]

'{\n "about": "Aqui estão os dados que você requisitou. Lembre-se: o Telegram não tem anúncios, não usamos seus dados para oferecer anúncios, não os vendemos para outros, e nem somos parte de nenhuma \\"família de empresas\\". O Telegram só mantém as informações que são necessárias para funcionar como um serviço em nuvem completo.\\n\\nVerifique Configurações > Privacidade e Segurança em seu dispositivo móvel para as opções relevantes.",\n "personal_information": {\n  "user_id": 692170770,\n  "first_name'

In [23]:
len(json_text)

37621000

In [25]:
json_files = json.loads(json_text)

In [26]:
type(json_files)

dict

In [29]:
json_files.keys()

dict_keys(['about', 'personal_information', 'profile_pictures', 'contacts', 'frequent_contacts', 'sessions', 'web_sessions', 'other_data', 'chats', 'left_chats'])

In [31]:
type(json_files['chats'])

dict

In [32]:
json_files['chats'].keys()

dict_keys(['about', 'list'])

In [35]:
print(len(json_files['chats']['list']))

6031


In [39]:
mesagens_telegram = json_files['chats']['list']

In [40]:
with open('mensagens_telegram_amostra.json', 'w') as f:
    json.dump(mesagens_telegram, f)

In [47]:
mesagens_telegram[0]['messages'][0]['text']

'Preciso urgente de orientaçao preciso de uma senha'

In [48]:
import pandas as pd

ModuleNotFoundError: No module named 'pandas'

In [49]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 233 kB/s eta 0:00:01
Installing collected packages: pandas
Successfully installed pandas-1.3.3


In [51]:
import pandas as pd

In [56]:
mensagens_df = pd.DataFrame.from_dict(mesagens_telegram[0]['messages'])

In [57]:
mensagens_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 796 entries, 0 to 795
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   796 non-null    int64  
 1   type                 796 non-null    object 
 2   date                 796 non-null    object 
 3   from                 796 non-null    object 
 4   from_id              796 non-null    object 
 5   text                 796 non-null    object 
 6   forwarded_from       154 non-null    object 
 7   photo                90 non-null     object 
 8   width                90 non-null     float64
 9   height               90 non-null     float64
 10  reply_to_message_id  27 non-null     float64
 11  edited               1 non-null      object 
dtypes: float64(3), int64(1), object(8)
memory usage: 74.8+ KB


In [58]:
mensagens_df.head(10)

Unnamed: 0,id,type,date,from,from_id,text,forwarded_from,photo,width,height,reply_to_message_id,edited
0,12279,message,2019-11-01T16:58:18,Kelly Assis,user980274329,Preciso urgente de orientaçao preciso de uma s...,,,,,,
1,12280,message,2019-11-01T16:58:18,Kelly Assis,user980274329,Preciso urgente de orientaçao pois nao consigo...,Kelly Assis,chats/chat_0001/photos/photo_1@01-11-2019_16-5...,1280.0,720.0,,
2,12281,message,2019-11-01T17:12:23,Kelly Assis,user980274329,,Kelly Assis,chats/chat_0001/photos/photo_2@01-11-2019_17-1...,720.0,1280.0,,
3,12282,message,2019-11-01T17:12:23,Kelly Assis,user980274329,Fala q a senha esta bloqueada,Kelly Assis,,,,,
4,12285,message,2019-11-01T17:18:30,Kelly Assis,user980274329,Se for ai qual setor posso proceder e o que de...,,,,,,
5,12286,message,2019-11-01T17:18:57,Kelly Assis,user980274329,Qual documento deve levar ai?,,,,,,
6,12324,message,2019-11-04T09:32:10,Coordenação Renach,user692170770,"Bom dia, operador é com a CAR",,,,,,
7,14400,message,2019-11-18T12:45:08,Kelly Assis,user980274329,Boa tarde como faço para cancelar processo abe...,,,,,,
8,14401,message,2019-11-18T12:45:33,Coordenação Renach,user692170770,tem que enviar ofício de alteração de empresa ...,,,,,,
9,14558,message,2019-11-18T21:46:47,Kelly Assis,user980274329,Se o condutor ficar reprovado na prova de atua...,,,,,,


In [63]:
for usuario in mesagens_telegram:
    partial_mensagens_df = pd.DataFrame.from_dict(usuario['messages'])
    mensagens_df = mensagens_df.append(partial_mensagens_df, ignore_index=True)
    print(mensagens['name'], mensagens_df.shape)
mensagens_df

Kelly (1592, 12)
Karoliny (1935, 12)
Informativos RENACH (1983, 15)
Cauh (1985, 18)
Arielson (2127, 18)
Gil (2159, 20)
Ligia Garcia (2163, 20)
CFC BRAGATTO (2355, 20)
Bru (2396, 20)
Clínica Unitran (2519, 20)
Christopher (2529, 20)
Mayke (2571, 20)
Igor (2578, 20)
Laiza (2588, 20)
Victor (2602, 20)
Elenick (2612, 20)
Fernando (2734, 20)
David (2766, 20)
Cleide (2933, 20)
Jacqueline (3038, 20)
Luan (3060, 20)
Cfc (3161, 20)
Enildo (3221, 20)
allan (3237, 20)
Cristiane (3909, 20)
Rogério (3988, 20)
Camila (4047, 20)
Saulo (4059, 20)
Demy Santos (4065, 20)
Paolla (4077, 20)
CFC RAÇA MAIS (4095, 20)
Vinicius (4109, 20)
CATEDRAL (4113, 20)
Roberta (4122, 21)
Marcylia (4137, 21)
glauber (4660, 21)
Clarisse (4662, 21)
Sindetran (5168, 21)
CFC TOP CAR (5198, 21)
Habilitar (5417, 21)
Lady Laura (5429, 21)
Elaine (5444, 21)
Tataiane (5457, 21)
Oscar (5463, 21)
CFC Siga Em Frente (5569, 21)
Luiz Carlos (5589, 21)
Edirlei Oliveira (5614, 21)
CFC (5772, 21)
CFC MAXCAR (5893, 21)
Antonio (5963, 21)


Lionildo (36651, 23)
Michela (36656, 23)
Jhordan (36659, 23)
Cfc (36890, 23)
Felipe (36891, 23)
Nelsileia (36892, 23)
Flaviani (36915, 23)
Destak Treinamentos (36926, 23)
Ketoren (36938, 23)
Alex (37002, 23)
Camila (37037, 23)
Maicon (37073, 23)
Gabriel (37089, 23)
Medtran (37511, 23)
Werick (37517, 23)
Alexandre (37528, 23)
Gabriel (37540, 23)
Cristiane Lopes (37567, 23)
Auto Escola (37623, 23)
Claudia (37688, 23)
Daniel Barbieri (37704, 23)
Auto Escola (37720, 23)
CFC O Condutor 2 (37743, 23)
Mariana (37754, 23)
Isaías (37767, 23)
Bernardo (37780, 23)
Jordana (37909, 23)
CFC São Cristóvão (38142, 23)
Carla Assis (38145, 23)
Melina (38150, 23)
Breno (38168, 23)
Rossana (38177, 23)
Renato (38206, 23)
Jaqueline (38213, 23)
Juliano (38264, 23)
Jamir (38296, 23)
Thales (38300, 23)
Paulo Herbert Domingos (38307, 23)
Alba (38309, 23)
Nathalia (38313, 23)
Mateus (38317, 23)
Sergio (38327, 23)
Cfc klippel (38344, 23)
PakPak (38354, 23)
Victória (38360, 23)
Coralina (38361, 23)
João Eduardo (3

Luana (54971, 23)
Mark (54981, 23)
Thadeu (54991, 23)
Silvanei (54995, 23)
Cleber (55025, 23)
Higor (55051, 23)
Josiane (55060, 23)
Henrique (55065, 23)
Salome (55098, 23)
Joaquim (55121, 23)
Simone (55128, 23)
Giovanna (55133, 23)
Rosieli (55134, 23)
Climatizadores (55144, 23)
Vinicius (55160, 23)
Arco-íris calçados Itabatã ! (55172, 23)
Venita (55181, 23)
Lucas (55189, 23)
Kel (55200, 23)
Julia (55205, 23)
Cidi (55218, 23)
Conadiz (55219, 23)
Matheus (55227, 23)
Caique Pinto (55237, 23)
Leonardo (55256, 23)
Bretas (55259, 23)
Carina (55267, 23)
RENOVE CNH (55356, 23)
Juliane (55370, 23)
Ana Paula (55378, 23)
Leandro (55398, 23)
Camila (55405, 23)
Erick (55410, 23)
Girlaine (55413, 23)
Josimar (55416, 23)
Rafael (55419, 23)
Milena (55464, 23)
Micael (55471, 23)
Joseniasdicastro (55474, 23)
CFC Edutran Filial (55491, 23)
Roger (55501, 23)
Enilda (55503, 23)
Edismar (55516, 23)
Mirian Raquel (55524, 23)
Maria Carolina (55530, 23)
Kamila (55538, 23)
Maria Auxiliadora (55539, 23)
Valéria 

Cfc Mika (66039, 23)
Bianca (66048, 23)
Allan (66061, 23)
Raphael (66065, 23)
Vet (adriana casali (66077, 23)
Paulo (66090, 23)
Janille (66106, 23)
Mylena Marques (66108, 23)
Carlos Augusto (66112, 23)
Alan (66125, 23)
Nayhana (66128, 23)
Franciele (66132, 23)
Luiz (66163, 23)
Iasmin (66167, 23)
Wellington (66168, 23)
Infomaster (66169, 23)
CFC Gálatas (66322, 23)
Ygor (66323, 23)
Edson (66332, 23)
Ana (66334, 23)
ÍTALO BRASILEIRO (66367, 23)
Alessandra (66390, 23)
Marcela (66400, 23)
Robson (66422, 23)
Edite (66434, 23)
Nalryd (66447, 23)
Francislaine (66490, 23)
Lucas Borghi (66517, 23)
Eduardo (66523, 23)
Poliana (66546, 23)
Patrick Richard (66549, 23)
Vitória (66878, 23)
Nathan (66888, 23)
Lucas (66889, 23)
Autoescola (66941, 23)
Rayane (66943, 23)
Jefinho (66960, 23)
Jessyca (66964, 23)
Fabio (66966, 23)
giulia (66971, 23)
ALESSANDRA (66974, 23)
Rimelc (66978, 23)
Allana (67006, 23)
Kesia (67010, 23)
FILP (67015, 23)
Roberto Cristofer (67027, 23)
Ludmila (67028, 23)
Nathália (6702

Jean cley (76000, 23)
HUDSON FALCÃO (76028, 23)
Leonardo (76040, 23)
Roberto (76047, 23)
Patricia (76050, 23)
Matheus (76054, 23)
Jhonata Ramos (76057, 23)
Opção Netimoveis (76058, 23)
Fábio (76061, 23)
Autoescola (76133, 23)
Raquiella (76141, 23)
Calli (76144, 23)
Vinicius (76151, 23)
Andreia (76185, 23)
Thallison (76186, 23)
Jaqueline (76187, 23)
Leonice Monteiro Dias Rocha (76207, 23)
Maciel (76210, 23)
Riotec (76230, 23)
Eduardo (76244, 23)
Rosalia (76260, 23)
Elisângela Souza (76280, 23)
Gilberto Dias (76290, 23)
Márcio (76297, 23)
Thauan (76311, 23)
Tiago (76315, 23)
David (76324, 23)
Eliabe (76345, 23)
Zani (76353, 23)
Alê (76359, 23)
Douglas (76416, 23)
Autoclin Colatina (76486, 23)
Alberto (76496, 23)
Lilica (76500, 23)
Eli (76504, 23)
Amanda (76510, 23)
Carolina (76515, 23)
Douglas (76521, 23)
Adrielly (76529, 23)
Zedio (76536, 23)
Elizabeth (76577, 23)
Cfc Mimoso Do Sul Filial (76600, 23)
Luana (76639, 23)
Autoescola (76742, 23)
Sultransito (76827, 23)
Reinaldo (76847, 23)
R

Luciana (83646, 25)
Juliana (83661, 25)
Otavio (83679, 25)
Simone (83692, 25)
Jussara (83708, 25)
Giulia (83718, 25)
LAÉLIO (83750, 25)
Patrick (83756, 25)
Flávio (83775, 25)
jose Francisco (83780, 25)
Higor (83788, 25)
Ana Mielke (83792, 25)
Andres (83808, 25)
robertin (83811, 25)
Priscila (83814, 25)
Rayeli (83862, 25)
Jeane (83881, 25)
Camila (83908, 25)
Regis (83926, 25)
Elizabete (83933, 25)
Alexandre (83937, 25)
Állida (83945, 25)
Andressa (83965, 25)
Lucas (83986, 25)
Hercília (83989, 25)
Esmaely (83997, 25)
Marcos Antônio (84005, 25)
Claudia (84011, 25)
Maria (84016, 25)
Alessandra Costa (84019, 25)
Maria (84022, 25)
Ana Paula (84026, 25)
Julya (84029, 25)
Stefano Jr (84034, 25)
Breno (84039, 25)
Antonio (84041, 25)
Adriana (84050, 25)
Josenildo (84056, 25)
Max (84057, 25)
Livia (84058, 25)
Marcelo (84059, 25)
Ranielly (84060, 25)
Henry (84061, 25)
Raulino (84063, 25)
Joelma (84066, 25)
Breno (84067, 25)
Vanilúzia (84076, 25)
Washington (84077, 25)
Etienne (84079, 25)
Ingrid (8

Patrick (89557, 25)
Karla (89561, 25)
Celinha (89572, 25)
LINO (89579, 25)
Renata (89584, 25)
Marcele (89647, 25)
None (89658, 25)
Letícia (89659, 25)
Rosiani (89660, 25)
Albert (89662, 25)
Vivyan (89663, 25)
Wadson (89665, 25)
Walter Ignacio (89683, 25)
Felipe (89701, 25)
Mining Bitcoin Hardware ⛅️ (89875, 25)
lucas (89894, 25)
Thais M (89907, 25)
Matheus (89924, 25)
Guiii (89942, 25)
Paz (89951, 25)
Paula (89962, 25)
juliet (89966, 25)
CARLOS VINICIOS CASAGRANDE RIBEIRO (89967, 25)
Michael (89968, 25)
Eloisa (89973, 25)
PAULO ERNESTO (89981, 25)
Maria Jose (89984, 25)
Autoescola (90002, 25)
Elizabethe (90007, 25)
Arturo (90030, 25)
Manuella (90060, 25)
Bill (90071, 25)
Valéria (90075, 25)
Auto Escola (90094, 25)
Saulo (90160, 25)
Silvestre (90162, 25)
Henrique (90202, 25)
Karloz (90208, 25)
Autoescola (90309, 25)
Everton (90316, 25)
Vagner (90320, 25)
Lucilia (90322, 25)
Ronaldo (90328, 25)
Weiglas (90338, 25)
Walace (90354, 25)
Gabriel (90359, 25)
Valmir (90362, 25)
Amanda (90364, 2

Elvio (95670, 27)
Jessica Jenniffer ❤️ (95671, 27)
Aurea (95672, 27)
Lorena (95678, 27)
João Pedro (95685, 27)
Fernanda (95693, 27)
Salusa Edith (95695, 27)
Gabriel (95706, 27)
Nadir (95710, 27)
Danilo (95717, 27)
Oliveira (95718, 27)
Givanildo Alves Castro (95727, 27)
Wemerson (95733, 27)
Ycaro Cesar (95744, 27)
Luíz (95747, 27)
Kadu (95768, 27)
Alexandra (95784, 27)
Breno (95795, 27)
Valdeir (95811, 27)
Simone (96034, 27)
Fernanda (96044, 27)
Arlan (96049, 27)
victor (96056, 27)
Marissa (96080, 27)
WTF-ULTRA-RIGHT (96081, 27)
Alan (96087, 27)
Gabriel (96111, 27)
André (96116, 27)
Joel (96124, 27)
Norminha (96148, 27)
Thamiris (96155, 27)
Fabrício Fernandes.. (96180, 27)
Emanuelle (96197, 27)
Yuri (96202, 27)
Igor (96215, 27)
Raphael (96227, 27)
Mariella (96230, 27)
Gessica (96237, 27)
Paulo (96246, 27)
Rossimery (96251, 27)
Cynthia (96258, 27)
Diego (96285, 27)
Vânia (96294, 27)
Cassiano (96296, 27)
Raquel (96303, 27)
Edson (96313, 27)
Thais (96319, 27)
Patricia (96330, 27)
Elias (96

May'Crivelari (101193, 27)
Danubia (101211, 27)
Tatiana (101252, 27)
Fabiana (101259, 27)
Geisa (101267, 27)
Bruno (101274, 27)
Leandro🇪🇺 (101286, 27)
Gean (101320, 27)
Rafaela (101339, 27)
Mfsouza (101349, 27)
Carlos Renato (101421, 27)
Mayana (101426, 27)
Salua (101435, 27)
Cezar (101448, 27)
Marize (101450, 27)
Bruno (101453, 27)
Adriana (101475, 27)
Alliny (101504, 27)
Wander (101516, 27)
John (101538, 27)
Jheniffer (101541, 27)
Hanna (101544, 27)
Marco (101555, 27)
None (101557, 27)
Thi (101560, 27)
Isabela (101561, 27)
Guarnierne (101562, 27)
MIRIANNE SILVA MELO PIZETTA (101563, 27)
GLÁUCIA BERNABÉ (101564, 27)
Jozelena (101565, 27)
Aline (101573, 27)
Marcos (101574, 27)
Lucas Am (101578, 27)
Juliana Cabral (101607, 27)
Simone (101647, 27)
Cm (101657, 27)
Rosy (101673, 27)
C.a lopes (101690, 27)
Acleia (101705, 27)
None (101723, 27)
Ricardo (101742, 27)
Milene (101750, 27)
Gabriiel (101766, 27)
Clara (101772, 27)
Juacy Rosa (101787, 27)
Cinthia (101801, 27)
SUELI (101827, 27)
Mat

Carine (108313, 27)
GUILHERME MORAIS DIAS (108314, 27)
Pedro (108322, 27)
None (108340, 27)
Layla (108352, 27)
BV Interiors Design Projects (108364, 27)
Amanda (108365, 27)
Vinicius (108388, 27)
Eliete (108415, 27)
Lúcio (108506, 27)
Nilzete (108533, 27)
None (108552, 27)
Vanessa (108564, 27)
Gustavo (108586, 27)
None (108591, 27)
Nathalia (108599, 27)
Rodrigo (108608, 27)
Maria (108622, 27)
Anderson (108642, 27)
Walace (108656, 27)
Edilson Pereira Ramos (108677, 27)
Arthur (108686, 27)
Patricia Teixeira (108692, 27)
Igor (108790, 27)
Thiago (108794, 27)
Yghor (108817, 27)
None (108818, 27)
Dhionatas (108819, 27)
LINO (108820, 27)
Paula (108830, 27)
Rafael (108880, 27)
Valdecir (108884, 27)
Aline (108907, 27)
Renatta (108938, 27)
None (108956, 27)
Durvalina Patricia (109002, 27)
Luciano (109008, 27)
None (109021, 27)
Liseto (109027, 27)
None (109028, 27)
Akira (109053, 27)
Marcos (109087, 27)
None (109111, 27)
Eliomar (109115, 27)
Cláudia (109133, 27)
Larissa (109163, 27)
Marcio Netto 

Rudy (113739, 27)
Junior (113753, 27)
Alessandro (113761, 27)
None (113816, 27)
Gabriel (113826, 27)
Ana Paula (113841, 27)
Pedro Henrique (113855, 27)
Rogério (113859, 27)
José (113871, 27)
Karina (114033, 27)
Dan (114074, 27)
Leandro (114075, 27)
Dariane (114077, 27)
None (114113, 27)
Oscar (114114, 27)
None (114115, 27)
Karol (114123, 27)
Wendel (114139, 27)
Wil (114140, 27)
Laudiane (114154, 27)
Josimar (114216, 27)
Elisangela (114234, 27)
None (114280, 27)
None (114287, 27)
Altagenia (114303, 27)
Walbert (114316, 27)
Marcelo (114341, 27)
Maycon (114359, 27)
None (114441, 27)
Luciano (114452, 27)
Juliano (114459, 27)
None (114461, 27)
Maycon (114472, 27)
T (114476, 27)
None (114489, 27)
Bruna Steele (114492, 27)
None (114493, 27)
Autoescola João Neiva (114522, 27)
Angelo (114524, 27)
Jairo (114538, 27)
Gustavo (114556, 27)
Suzana Barbosa (114560, 27)
JEiZZN (114580, 27)
Lunei (114584, 27)
. (114595, 27)
Carlos (114609, 27)
None (114616, 27)
Cesar (114631, 27)
None (114647, 27)
Sam 

Maria Beatrix (120842, 27)
- (120856, 27)
None (120874, 27)
None (120879, 27)
Delis (120884, 27)
./ (120901, 27)
Luiz Fernando (120919, 27)
Adriana (120924, 27)
Mayane (120930, 27)
Alexandre (120932, 27)
None (121232, 27)
Lucas (121237, 27)
None (121253, 27)
Bruno (121257, 27)
Esthevão (121266, 27)
Lorenzo (121269, 27)
Lara (121272, 27)
None (121291, 27)
Ravyan (121294, 27)
Ana Luiza (121308, 27)
PR (121312, 27)
Rogerio (121315, 27)
None (121319, 27)
Cassiano (121323, 27)
None (121326, 27)
Carla (121372, 27)
Mirtes (121374, 27)
Tamara (121381, 27)
None (121398, 27)
None (121403, 27)
None (121404, 27)
None (121432, 27)
Wellington (121433, 27)
Euquiasdsantos@gmail.com (121434, 27)
marilia (121444, 27)
Carolina (121483, 27)
Geeni (121486, 27)
Thiago (121492, 27)
Rosi (121499, 27)
None (121500, 27)
Camila (121508, 27)
None (121515, 27)
None (121522, 27)
None (121527, 27)
Leo (121536, 27)
None (121733, 27)
Silmara (121740, 27)
:) (121754, 27)
Fábio (121761, 27)
Evelin (121775, 27)
Michelly 

Marcos (127339, 27)
None (127354, 27)
Elcinei (127367, 27)
None (127388, 27)
Deusivane (127398, 27)
None (127410, 27)
None (127421, 27)
Phill (127455, 27)
Thiago (127458, 27)
Leandra (127494, 27)
Luciano (127503, 27)
None (127512, 27)
Renilson (127557, 27)
None (127559, 27)
Rodrigo (127567, 27)
None (127574, 27)
None (127584, 27)
Lavinia (127588, 27)
None (127604, 27)
Letícia (127609, 27)
Fernando (127612, 27)
None (127615, 27)
Thiago (127618, 27)
None (127629, 27)
None (127633, 27)
None (127636, 27)
Weverton (127639, 27)
Guilherme (127644, 27)
Caroll (127678, 27)
None (127689, 27)
Cfc Imperial (127691, 27)
Lorrainy (127697, 27)
None (127699, 27)
Rodrigo (127700, 27)
Raquel (127727, 27)
Arthur (127737, 27)
WELLINGTON FRAGA GARCIA (127738, 27)
Naiara (127746, 27)
Evelyn (127747, 27)
None (127755, 27)
None (127783, 27)
None (127794, 27)
Marcos (127802, 27)
Davi (127823, 27)
Helio (127837, 27)
Hariel (127849, 27)
Débora (127870, 27)
None (127880, 27)
Luis Felipe (127921, 27)
Mateus Zocate

Lourrana (133075, 27)
Julia ✨ (133180, 27)
Camila (133200, 27)
Isabelly (133209, 27)
Joelson Coelho (133212, 27)
Clínica Centram (133567, 27)
Isabela (133580, 27)
None (133591, 27)
None (133599, 27)
Gabriela (133602, 27)
None (133615, 27)
Marcos (133641, 27)
None (133644, 27)
Douglas (133646, 27)
None (133651, 27)
Maria Clara (133656, 27)
None (133658, 27)
EVERALDO FERREIRA DE ARAUJO (133659, 27)
Julia (133674, 27)
Selva (133677, 27)
None (133708, 27)
Nilsilene (133713, 27)
Ju (133737, 27)
None (133780, 27)
Nelson (133788, 27)
None (133801, 27)
None (133830, 27)
None (133850, 27)
Samir (133852, 27)
Camila (133879, 27)
Flávia (133888, 27)
Daniel (133910, 27)
Pedro (133926, 27)
None (133948, 27)
Uanderson (133957, 27)
Juliane (133978, 27)
Borracharia Altinopolis (133988, 27)
Izabela (133999, 27)
Leonardo (134009, 27)
None (134025, 27)
Sérgio (134027, 27)
José Augusto (134035, 27)
Heitor (134059, 27)
Isabella (134076, 27)
None (134090, 27)
Nájila (134105, 27)
Cristiano (134113, 27)
None (

penalidadebot (139593, 27)
None (139620, 27)
None (139662, 27)
Leonie (139672, 27)
None (139683, 27)
None (139711, 27)
Ykaro (139715, 27)
None (139740, 27)
Jussara Helena (139744, 27)
None (139754, 27)
J (139756, 27)
Fernanda (139799, 27)
F (139825, 27)
Gabrielly (139851, 27)
None (139856, 27)
None (139915, 27)
Edgar (139927, 27)
None (139934, 27)
None (139942, 27)
Valcilene Gomes (139945, 27)
Miriam (139966, 27)
None (139975, 27)
Mariane (139985, 27)
Gilvan (139999, 27)
None (140009, 27)
None (140019, 27)
Detran ES SGIP_bot (140029, 27)
None (140107, 27)
None (140114, 27)
None (140175, 27)
None (140203, 27)
Gil (140250, 27)
Richardson (140275, 27)
Fabrício (140311, 27)
None (140325, 27)
None (140332, 27)
Lucas (140381, 27)
None (140416, 27)
De (140422, 27)
Aderlan (140451, 27)
Claudio (140457, 27)
None (140465, 27)
Barbosa (140474, 27)
None (140481, 27)
None (140627, 27)
Theandra (140659, 27)
Stheffane (140663, 27)
None (140674, 27)
None (140687, 27)
Thai (140694, 27)
None (140706, 27

Daniel (149064, 27)
Cris (149070, 27)
None (149093, 27)
None (149130, 27)
João Paulo Cardoso (149144, 27)
None (149264, 27)
Rony (149294, 27)
None (149307, 27)
None (149368, 27)
Kátia (149385, 27)
Jailer (149402, 27)
None (149593, 27)
None (149690, 27)
None (149698, 27)
None (149865, 27)
Valcy (149916, 27)
None (149947, 27)
None (150030, 27)
Zambon (150034, 27)
Paula (150049, 27)
None (150080, 27)
Juliana (150113, 27)
None (150149, 27)
None (150295, 27)
Welington (150326, 27)
None (150402, 27)
None (150486, 27)
Karina (150535, 27)
None (150583, 27)
None (150625, 27)
None (150650, 27)
Fabiola (150672, 27)
Rodrigo (150699, 27)
CFC (150768, 27)
None (150772, 27)
Leo Santos (150788, 27)
Elizete (150870, 27)
None (150907, 27)
Camila (150918, 27)
Rosangela (150968, 27)
None (150984, 27)
CFC (151008, 27)
Clips (151059, 27)
Everaldo (151072, 27)
None (151239, 27)
None (151244, 27)
Aline (151247, 27)
Trabalho (151247, 27)
Carla (151351, 27)
None (151365, 27)
C M (151369, 27)
None (151454, 27)
D

Unnamed: 0,id,type,date,from,from_id,text,forwarded_from,photo,width,height,...,action,media_type,duration_seconds,sticker_emoji,discard_reason,contact_information,location_information,live_location_period_seconds,members,contact_vcard
0,12279,message,2019-11-01T16:58:18,Kelly Assis,user980274329,Preciso urgente de orientaçao preciso de uma s...,,,,,...,,,,,,,,,,
1,12280,message,2019-11-01T16:58:18,Kelly Assis,user980274329,Preciso urgente de orientaçao pois nao consigo...,Kelly Assis,chats/chat_0001/photos/photo_1@01-11-2019_16-5...,1280.0,720.0,...,,,,,,,,,,
2,12281,message,2019-11-01T17:12:23,Kelly Assis,user980274329,,Kelly Assis,chats/chat_0001/photos/photo_2@01-11-2019_17-1...,720.0,1280.0,...,,,,,,,,,,
3,12282,message,2019-11-01T17:12:23,Kelly Assis,user980274329,Fala q a senha esta bloqueada,Kelly Assis,,,,...,,,,,,,,,,
4,12285,message,2019-11-01T17:18:30,Kelly Assis,user980274329,Se for ai qual setor posso proceder e o que de...,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151624,12960,message,2019-11-06T09:28:54,Coordenação Renach,user692170770,"Bom dia, favor informar o CPF",,,,,...,,,,,,,,,,
151625,12961,message,2019-11-06T09:32:30,Bruno Alvarenga,user551167743,03930767767,,,,,...,,,,,,,,,,
151626,12965,message,2019-11-06T09:39:54,Coordenação Renach,user692170770,assim que for feito o processo de renovação o ...,,,,,...,,,,,,,,,,
151627,12972,message,2019-11-06T10:03:15,Bruno Alvarenga,user551167743,Tks. Bom trabalho.,,,,,...,,,,,,,,,,


In [64]:
mensagens_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151629 entries, 0 to 151628
Data columns (total 27 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   id                            151629 non-null  int64  
 1   type                          151629 non-null  object 
 2   date                          151629 non-null  object 
 3   from                          140226 non-null  object 
 4   from_id                       151438 non-null  object 
 5   text                          151629 non-null  object 
 6   forwarded_from                4142 non-null    object 
 7   photo                         9917 non-null    object 
 8   width                         10299 non-null   float64
 9   height                        10299 non-null   float64
 10  reply_to_message_id           3157 non-null    float64
 11  edited                        222 non-null     object 
 12  file                          1567 non-null 

In [71]:
mensagens_df.to_csv("mensagens_telegram_amostras.csv",index=False,header=False, sep='¿')

In [72]:
mensagens_df.head()

Unnamed: 0,id,type,date,from,from_id,text,forwarded_from,photo,width,height,...,action,media_type,duration_seconds,sticker_emoji,discard_reason,contact_information,location_information,live_location_period_seconds,members,contact_vcard
0,12279,message,2019-11-01T16:58:18,Kelly Assis,user980274329,Preciso urgente de orientaçao preciso de uma s...,,,,,...,,,,,,,,,,
1,12280,message,2019-11-01T16:58:18,Kelly Assis,user980274329,Preciso urgente de orientaçao pois nao consigo...,Kelly Assis,chats/chat_0001/photos/photo_1@01-11-2019_16-5...,1280.0,720.0,...,,,,,,,,,,
2,12281,message,2019-11-01T17:12:23,Kelly Assis,user980274329,,Kelly Assis,chats/chat_0001/photos/photo_2@01-11-2019_17-1...,720.0,1280.0,...,,,,,,,,,,
3,12282,message,2019-11-01T17:12:23,Kelly Assis,user980274329,Fala q a senha esta bloqueada,Kelly Assis,,,,...,,,,,,,,,,
4,12285,message,2019-11-01T17:18:30,Kelly Assis,user980274329,Se for ai qual setor posso proceder e o que de...,,,,,...,,,,,,,,,,


In [77]:
mensagens_df.set_index()

Coordenação Renach                                  49839
Oszilene de Freitas CFC Aparecida - Nova Venécia     1120
Kelly Assis                                           884
Rozi Marchesi                                         830
Cliparh Clinica Do Detran                             824
                                                    ...  
Juliana Viudes                                          1
Victor Rafalski                                         1
Mônica Oliveira                                         1
Gasha                                                   1
Junior Anceschi                                         1
Name: from, Length: 4797, dtype: int64

In [81]:
mensagens_df.describe()

Unnamed: 0,id,width,height,reply_to_message_id,duration_seconds,live_location_period_seconds
count,151629.0,10299.0,10299.0,3157.0,284.0,1.0
mean,94535.702128,987.060297,1027.669968,102036.158695,25.757042,900.0
std,47406.154484,312.505398,319.764637,47259.94957,24.26417,
min,5.0,90.0,18.0,8.0,1.0,900.0
25%,54121.0,720.0,720.0,60009.0,11.0,900.0
50%,94459.0,960.0,1276.0,108388.0,19.0,900.0
75%,135310.0,1280.0,1280.0,143381.0,34.0,900.0
max,176099.0,4608.0,4570.0,175665.0,262.0,900.0


In [83]:
mensagens_df.set_index(['id', 'from_id'])

Unnamed: 0_level_0,Unnamed: 1_level_0,type,date,from,text,forwarded_from,photo,width,height,reply_to_message_id,edited,...,action,media_type,duration_seconds,sticker_emoji,discard_reason,contact_information,location_information,live_location_period_seconds,members,contact_vcard
id,from_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
12279,user980274329,message,2019-11-01T16:58:18,Kelly Assis,Preciso urgente de orientaçao preciso de uma s...,,,,,,,...,,,,,,,,,,
12280,user980274329,message,2019-11-01T16:58:18,Kelly Assis,Preciso urgente de orientaçao pois nao consigo...,Kelly Assis,chats/chat_0001/photos/photo_1@01-11-2019_16-5...,1280.0,720.0,,,...,,,,,,,,,,
12281,user980274329,message,2019-11-01T17:12:23,Kelly Assis,,Kelly Assis,chats/chat_0001/photos/photo_2@01-11-2019_17-1...,720.0,1280.0,,,...,,,,,,,,,,
12282,user980274329,message,2019-11-01T17:12:23,Kelly Assis,Fala q a senha esta bloqueada,Kelly Assis,,,,,,...,,,,,,,,,,
12285,user980274329,message,2019-11-01T17:18:30,Kelly Assis,Se for ai qual setor posso proceder e o que de...,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12960,user692170770,message,2019-11-06T09:28:54,Coordenação Renach,"Bom dia, favor informar o CPF",,,,,,,...,,,,,,,,,,
12961,user551167743,message,2019-11-06T09:32:30,Bruno Alvarenga,03930767767,,,,,,,...,,,,,,,,,,
12965,user692170770,message,2019-11-06T09:39:54,Coordenação Renach,assim que for feito o processo de renovação o ...,,,,,,,...,,,,,,,,,,
12972,user551167743,message,2019-11-06T10:03:15,Bruno Alvarenga,Tks. Bom trabalho.,,,,,,,...,,,,,,,,,,


In [92]:
mensagens_df.from_id

0         user980274329
1         user980274329
2         user980274329
3         user980274329
4         user980274329
              ...      
151624    user692170770
151625    user551167743
151626    user692170770
151627    user551167743
151628    user692170770
Name: from_id, Length: 151629, dtype: object

In [96]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [102]:
''' Not all variables are being undestood as strings so we have to force it'''
preprocessed_text_data = mensagens_df['text'].to_list()

In [None]:
# https://sanjayasubedi.com.np/nlp/nlp-with-python-document-clustering/ 

In [105]:
vec = TfidfVectorizer()
vec.fit(preprocessed_text_data)
features = vec.transform(preprocessed_text_data)

AttributeError: 'list' object has no attribute 'lower'