In [1]:
import requests
import json
import pandas as pd
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import pdist, squareform

In [2]:
from pymongo import MongoClient
dbName = "APIchat"
mongodbURL = f"mongodb://localhost/{dbName}"
print(mongodbURL)
client = MongoClient(mongodbURL)
# Conectar la db
db = client.get_database()
db = client.get_default_database()["comments"]
print(db)

mongodb://localhost/APIchat
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'APIchat'), 'comments')


## Extraer todos los mensajes y usuario

In [3]:
mensajes = list(db.find({"type": "message"}, {"user_id":1, "text":1, "_id":0}))
len(mensajes)

36

In [6]:
usuarios = [e["user_id"] for e in mensajes]
textos = [e["text"] for e in mensajes]
#print(len(usuarios))

In [None]:
user_names = []
for u in usuarios:
    nombres = list(db.find({"_id": u}, {"_id":0}))
    user_names.append(nombres[0]["username"]) 
print(user_names)

## Extraer los comentarios a un data frame

In [7]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/lee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/lee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
def textCleaner(frase):
    stop_words = set(stopwords.words('english')) 
    # Tokeniza
    word_tokens = word_tokenize(frase)
    # Extract whatever is not a stop word
    cleaned = [w for w in word_tokens if w not in stop_words]
    joined = " ".join(cleaned)
    return joined

In [9]:
cleaned = [textCleaner(e) for e in textos ]
print(cleaned)

["I sure American . Disappointed realize 's UK . Great example getting arrested front kids . Thoroughly deserve though", 'These parents made stand naughty corner grounded pocket money next month !', "Why I feeling point near future 'protesters ' experience protesters US ; discover harsh reality stupidness refusing follow simplest lockdown requests ? !", 'Perhaps media giving consistent message . Too many articles claiming lockdown ’ working start getting back normal enough articles demonstrating benefits lockdown importance abiding guidance .', "Because many people got 'Locked ' brains think Lockdown NEVER end news pumping heads day day 2 months Lockdown Stay Home Social Distancing", 'Rats breeding rats', 'Brainless chicken .', 'Well get virus dont treat . Serves right .', 'As government forking millions pounds money pay people sit backsides fun , rest us put lives risk working whatever field keep country going hell . Why earth would want lie figures . It serves purpose . Just cause lo

In [10]:
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(cleaned)
print(list(count_vectorizer.vocabulary_.keys()))
m = sparse_matrix.todense()
print(m.shape)
print(m[0])

['sure', 'american', 'disappointed', 'realize', 'uk', 'great', 'example', 'getting', 'arrested', 'front', 'kids', 'thoroughly', 'deserve', 'though', 'these', 'parents', 'made', 'stand', 'naughty', 'corner', 'grounded', 'pocket', 'money', 'next', 'month', 'why', 'feeling', 'point', 'near', 'future', 'protesters', 'experience', 'us', 'discover', 'harsh', 'reality', 'stupidness', 'refusing', 'follow', 'simplest', 'lockdown', 'requests', 'perhaps', 'media', 'giving', 'consistent', 'message', 'too', 'many', 'articles', 'claiming', 'working', 'start', 'back', 'normal', 'enough', 'demonstrating', 'benefits', 'importance', 'abiding', 'guidance', 'because', 'people', 'got', 'locked', 'brains', 'think', 'never', 'end', 'news', 'pumping', 'heads', 'day', 'months', 'stay', 'home', 'social', 'distancing', 'rats', 'breeding', 'brainless', 'chicken', 'well', 'get', 'virus', 'dont', 'treat', 'serves', 'right', 'as', 'government', 'forking', 'millions', 'pounds', 'pay', 'sit', 'backsides', 'fun', 'rest

## Sacar la matriz de las palabras

In [11]:
doc_term_matrix = sparse_matrix.todense()
df_words = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=usuarios)
df_words.shape

(36, 272)

## Agrupar los usuarios 

In [12]:
user_words = df_words.groupby(df_words.index).sum()
user_words.shape

(29, 272)

## Calcular las distancias

In [13]:
user_dist = pd.DataFrame(1/(1 + squareform(pdist(user_words, 'cosine'))),
                         index=user_words.index, columns=user_words.index)

In [14]:
user_dist

Unnamed: 0,5ead9138877d9a546966a8ed,5ead9171877d9a546966a8ee,5ead9179877d9a546966a8ef,5ead9196877d9a546966a8f1,5ead91b6877d9a546966a8f2,5ead91c2877d9a546966a8f3,5ead91cb877d9a546966a8f4,5ead91da877d9a546966a8f5,5ead91e4877d9a546966a8f6,5eadecef206911bea1263687,...,5eadedb0206911bea1263691,5eadedb9206911bea1263692,5eadedc8206911bea1263693,5eadedf1206911bea1263694,5eadee1e206911bea1263695,5eadee2a206911bea1263696,5eadee3d206911bea1263697,5eadee4d206911bea1263698,5eadee59206911bea1263699,5eadee76206911bea126369a
5ead9138877d9a546966a8ed,1.0,0.5,0.5,0.5,0.512195,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.566798,0.5
5ead9171877d9a546966a8ee,0.5,1.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
5ead9179877d9a546966a8ef,0.5,0.5,1.0,0.523407,0.507246,0.5,0.553235,0.5,0.5,0.526316,...,0.5,0.5,0.521281,0.5,0.52549,0.5,0.5,0.5,0.5,0.5
5ead9196877d9a546966a8f1,0.5,0.5,0.523407,1.0,0.516499,0.5,0.522484,0.5,0.5,0.5,...,0.5,0.5,0.523913,0.5,0.5,0.5,0.5,0.5,0.5,0.5
5ead91b6877d9a546966a8f2,0.512195,0.5,0.507246,0.516499,1.0,0.5,0.506969,0.511005,0.509731,0.518519,...,0.518519,0.5,0.530967,0.5,0.508815,0.5,0.5,0.5,0.5,0.5
5ead91c2877d9a546966a8f3,0.5,0.5,0.5,0.5,0.5,1.0,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
5ead91cb877d9a546966a8f4,0.5,0.5,0.553235,0.522484,0.506969,0.5,1.0,0.5,0.513198,0.5,...,0.5,0.5,0.5,0.5,0.511948,0.5,0.5,0.5,0.5,0.5
5ead91da877d9a546966a8f5,0.5,0.5,0.5,0.5,0.511005,0.5,0.5,1.0,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
5ead91e4877d9a546966a8f6,0.5,0.5,0.5,0.5,0.509731,0.5,0.513198,0.5,1.0,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.524794
5eadecef206911bea1263687,0.5,0.5,0.526316,0.5,0.518519,0.5,0.5,0.5,0.5,1.0,...,0.5,0.5,0.556831,0.5,0.5,0.5,0.5,0.5,0.5,0.5


## Calcular el más cercano

In [None]:
similar_users = []

for c in user_dist.columns:
    similar_users.append(user_dist[c].sort_values(ascending=False)[1:2].index[0])
print(similar_users)

In [None]:
for i in range(user_dist.shape[0]):
    similar_users.append(user_dist.iloc[i].sort_values(ascending=False)[1:2])    

In [None]:
print(user_dist.columns[0])

In [15]:
dic_rec = {}
for i in range(len(user_dist.columns)):
    dic_rec[user_dist.columns[i]] = user_dist.iloc[i].sort_values(ascending=False)[1:2].index[0]
    

In [16]:
print(dic_rec)

{ObjectId('5ead9138877d9a546966a8ed'): ObjectId('5eadee59206911bea1263699'), ObjectId('5ead9171877d9a546966a8ee'): ObjectId('5eadee76206911bea126369a'), ObjectId('5ead9179877d9a546966a8ef'): ObjectId('5ead91cb877d9a546966a8f4'), ObjectId('5ead9196877d9a546966a8f1'): ObjectId('5eadedc8206911bea1263693'), ObjectId('5ead91b6877d9a546966a8f2'): ObjectId('5eadedc8206911bea1263693'), ObjectId('5ead91c2877d9a546966a8f3'): ObjectId('5eadee76206911bea126369a'), ObjectId('5ead91cb877d9a546966a8f4'): ObjectId('5ead9179877d9a546966a8ef'), ObjectId('5ead91da877d9a546966a8f5'): ObjectId('5eaded69206911bea126368c'), ObjectId('5ead91e4877d9a546966a8f6'): ObjectId('5eadee76206911bea126369a'), ObjectId('5eadecef206911bea1263687'): ObjectId('5eadecfc206911bea1263688'), ObjectId('5eadecfc206911bea1263688'): ObjectId('5eadecef206911bea1263687'), ObjectId('5eaded12206911bea1263689'): ObjectId('5eadecfc206911bea1263688'), ObjectId('5eaded55206911bea126368a'): ObjectId('5eadedf1206911bea1263694'), ObjectId('5

## Buscar usuario

In [28]:
u = 'GreenElephant'

In [39]:
# Sacar id
u_id = list(db.find({"username": u}, {"_id:1"}))
if len(u_id) == 0:
    print("error")
else:
    print(u_id[0]["_id"])
    rec_name = db.find_one({"_id":dic_rec[(u_id[0]["_id"])]})["username"]

# Buscar id
#print(dic_rec[u_id])

# Traducir
#rec_name = db.find_one({"_id":dic_rec[u_id]})["username"]
print(rec_name)

5ead91e4877d9a546966a8f6
The Pipes
