In [18]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import seaborn

import datetime
from dateutil import parser

from pymongo import MongoClient
from tqdm import tqdm

In [2]:
# Open a MongoDB client with its default connection settings and select the database.
client = MongoClient()
db = client["rf_test"]

# Handles to the four collections used throughout this notebook.
col_entries, col_inst, col_inds, col_nouns = (
    db[collection_name]
    for collection_name in ("entries", "inst", "inds", "nouns")
)

## First look at entries

### What are those?

In [3]:
def list_entries():
    """Return the distinct values of the "type" field in ``col_entries`` as a list."""
    return list(col_entries.distinct("type"))

### What do we have the most of?

In [8]:
# Compute the entry types directly rather than reading them from IPython's
# output history (`Out[4]`), which only exists if that exact cell was executed
# in the current kernel session and therefore breaks on Restart & Run All.
entries = list_entries()

In [12]:
def count_entry():
    """Count the documents of each entry type.

    Walks the module-level ``entries`` list in order. For each type it queries
    ``col_entries`` for the number of matching documents, prints a
    ``count|type`` line, and records the pair.

    Returns:
        list of ``(count, type)`` tuples, in the same order as ``entries``.
    """
    def _tally(entry_type):
        # NOTE(review): Cursor.count() was removed in PyMongo 4.0 - confirm the
        # installed version; Collection.count_documents() is the replacement.
        n_docs = col_entries.find({"type": entry_type}).count()
        print("{}|{}".format(n_docs, entry_type))
        return (n_docs, entry_type)

    return [_tally(entry_type) for entry_type in entries]

In [14]:
# Run the count explicitly rather than reading it from IPython's output
# history (`Out[13]`), which is not available on a fresh kernel.
entry_count = count_entry()

In [17]:
# Most frequent entry types first: sort ascending by count, then reverse the list.
list(reversed(sorted(entry_count, key=lambda pair: pair[0])))

[(177928, 'URL'),
 (52303, 'Username'),
 (11210, 'InternetDomainName'),
 (7060, 'Source'),
 (4916, 'GeoEntity'),
 (4661, 'Hashtag'),
 (4238, 'Company'),
 (3700, 'Person'),
 (3307, 'OrgEntity'),
 (2704, 'City'),
 (2616, 'Malware'),
 (2196, 'IndustryTerm'),
 (2049, 'ProvinceOrState'),
 (1917, 'Organization'),
 (1384, 'Product'),
 (923, 'Technology'),
 (861, 'IpAddress'),
 (680, 'Position'),
 (452, 'Facility'),
 (335, 'MalwareSignature'),
 (332, 'FileName'),
 (210, 'Country'),
 (207, 'Region'),
 (126, 'PublishedMedium'),
 (125, 'Hash'),
 (123, 'EmailAddress'),
 (97, 'CyberVulnerability'),
 (66, 'Industry'),
 (48, 'MetaType'),
 (43, 'NaturalFeature'),
 (42, 'AttackVector'),
 (29, 'Feature'),
 (28, 'Topic'),
 (25, 'OperatingSystem'),
 (23, 'Operation'),
 (22, 'TVShow'),
 (18, 'SourceMediaType'),
 (16, 'Religion'),
 (16, 'MedicalCondition'),
 (15, 'EntityList'),
 (15, 'MalwareCategory'),
 (15, 'WinRegKey'),
 (13, 'Holiday'),
 (12, 'Commodity'),
 (12, 'ProgrammingLanguage'),
 (9, 'MarketIndex

Among these entry types, usernames seem interesting.

### Usernames

First, we want to look at the usernames we have in the database. To find all of them, we can do:

In [4]:
def find_all_usernames():
    """Return, as a list, every document in ``col_entries`` whose "type" is "Username"."""
    username_query = {"type": "Username"}
    return list(col_entries.find(username_query))

Since there are about 50k usernames, let's just look at a single one (arbitrarily, the one at index 30):

In [7]:
# Load every username document, keep the one at index 30, and display it.
u = find_all_usernames()[30]

u

{'_id': ObjectId('58d9f0e4df3e728bd7e112e7'),
 'created': '2014-07-16T14:44:21.851Z',
 'created_at': '2013-12-09T15:48:56.000Z',
 'curated': 0,
 'domain': 'B_E-R',
 'hits': 40,
 'id': 'LTRvjC',
 'meta_type': 'type:Username',
 'name': '@felicita_andrad',
 'type': 'Username'}

Note the 'id' field. Since it corresponds to the `attributes.authors` field of an instance, we can use it to count the contributions of a username: for each username, we can count or list all of the instances they are associated with.

In [10]:
def find_inst_by_username(userid):
    """Return, as a list, every document in ``col_inst`` whose "attributes.authors" matches ``userid``."""
    author_filter = {"attributes.authors": userid}
    matching = col_inst.find(author_filter)
    return list(matching)

In [11]:
# List the instances whose authors include the 'id' of the username document shown above.
find_inst_by_username(userid="LTRvjC")

[{'_id': ObjectId('58d9f0b6df3e728bd7de539f'),
  'attributes': {'analyzed': '2015-09-21T17:29:21.094Z',
   'authors': ['LTRvjC'],
   'binning_id': 'GoX8I15DMMc',
   'canonic_id': 'GoX8I15DMMc',
   'document_external_id': '646013299067228160',
   'document_offset': 0,
   'document_url': 'url:https://twitter.com/felicita_andrad/statuses/646013299067228160',
   'entities': ['url:http://www.20minutos.es/noticia/2561290/0/app-store-apple/victima-ciberataque-wechat/xcodeghost/',
    'JDYK14',
    'B_LyO',
    'O8A5tj'],
   'fragment_count': 1,
   'function': 'id',
   'general_negative': 0.0,
   'general_positive': 0.0,
   'indicator': 'ciberataque',
   'meta_type': 'type:CyberAttack',
   'negative': 0.0,
   'positive': 0.0,
   'sentiments': {'activism': 0.0,
    'general_negative': 0.0,
    'general_positive': 0.0,
    'negative': 0.0,
    'positive': 0.0,
    'violence': 0.0},
   'target': ['JDYK14'],
   'target_string': 'EFE Un',
   'topics': ['KPzZAE'],
   'user_data': {'followers_count':

We could also go through the data and see which usernames "have the most to say".

In [19]:
def username_ranking():
    """Rank usernames by the number of instances they authored.

    For every username document returned by ``find_all_usernames()``, counts
    the documents in ``col_inst`` whose ``attributes.authors`` matches that
    username's ``id``. Progress is reported through ``tqdm``.

    Returns:
        list of ``(id, name, count)`` tuples sorted by ``count``, most prolific
        usernames first, so that ``username_ranking()[:20]`` is the top 20.
        Usernames with equal counts keep their original relative order.
    """
    username_ranks = []
    for user in tqdm(find_all_usernames()):
        uid = user['id']
        # One count query per username; this dominates the running time.
        count = col_inst.find({"attributes.authors": uid}).count()
        username_ranks.append((uid, user['name'], count))
    # Use one accumulator name throughout (the previous version appended to
    # `usernames_ranks` but sorted `username_ranks`, raising NameError), and
    # sort descending so the usernames with the most instances come first.
    return sorted(username_ranks, key=lambda rank: rank[2], reverse=True)

On my computer, this process would take around 5 hours to go through and count up all the usernames: we get about 3 usernames per second, and there are roughly 52,000 of them.

In [20]:
# Display the first 20 entries of the ranking.
username_ranking()[:20]

  1%|          | 339/52303 [01:51<4:50:21,  2.98it/s]

KeyboardInterrupt: 