In [None]:
import pandas as pd
from pymongo import MongoClient


def _connect_mongo(host, port, username, password, db):
    """Open a MongoDB client and return a handle to one database.

    Parameters
    ----------
    host, port
        Address of the MongoDB server.
    username, password
        Raw (unescaped) credentials. Authentication is only attempted when
        both are truthy; otherwise an unauthenticated client is created.
    db
        Name of the database to return. When credentials are given it is also
        placed in the URI path.

    Returns
    -------
    The ``conn[db]`` database object of the newly created ``MongoClient``.
    """
    # Imported here so the helper is self-contained.
    from urllib.parse import quote_plus

    if username and password:
        # Reserved characters such as '@', ':', '/' or '%' inside the
        # credentials would otherwise be parsed as URI structure, so they
        # must be percent-escaped before being embedded in the URI.
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (
            quote_plus(str(username)),
            quote_plus(str(password)),
            host,
            port,
            db,
        )
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)

    return conn[db]


def read_mongo(db, collection, query=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Run a ``find`` on one MongoDB collection and return the result as a DataFrame.

    Parameters
    ----------
    db : name of the database.
    collection : name of the collection inside ``db``.
    query : filter passed to ``find``; ``None`` (the default) means an empty
        filter ``{}``.
    host, port, username, password : forwarded to ``_connect_mongo``.
    no_id : when true, the ``_id`` column is removed from the result.

    Returns
    -------
    pandas.DataFrame built from the full list of returned documents. If the
    query matches nothing the frame is empty.
    """
    # Build a fresh filter per call instead of sharing one mutable default.
    if query is None:
        query = {}

    # Connect to MongoDB
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)
    try:
        # list() drains the cursor, so all documents are in memory before
        # the client is closed below.
        records = list(database[collection].find(query))
    finally:
        # Each call creates its own client; close it so connections do not
        # pile up across repeated calls.
        database.client.close()

    # Construct the DataFrame from the materialised documents
    df = pd.DataFrame(records)

    # An empty result produces a frame without any columns, so only drop
    # `_id` when it is actually present.
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

In [None]:
# Load every document of the `triplet_train` collection (database `wikidata`).
t = read_mongo(db="wikidata", collection="triplet_train")

In [None]:
# Count the non-null `Object` values for each (Predicate, PredicateId) pair
# and expose that number under the column name `Count`.
by_predicate = (
    t.groupby(by=['Predicate', 'PredicateId'], as_index=False)
    .agg({'Object': 'count'})
    .rename(columns={'Object': 'Count'})
)

In [None]:
# Load every document of the `property` collection and preview the first rows.
p = read_mongo(db="wikidata", collection="property")
p.head()

In [None]:
# Order the per-predicate counts from largest to smallest.
by_predicate = by_predicate.sort_values('Count', ascending=False)
by_predicate.head()

In [None]:
# Left join: every row of `p` is kept; rows whose `WikidataId` has no
# matching `PredicateId` get NaN in the columns coming from `by_predicate`.
pp = pd.merge(
    p,
    by_predicate,
    how='left',
    left_on='WikidataId',
    right_on='PredicateId',
)
pp.head()

In [None]:
# Reshape the merged frame:
#   * Title      -> TitleEn
#   * Predicate  -> TitleUk
#   * WikidataId -> Id
# The renamed columns are appended at the end (in that order); the source
# columns and the ones that are not needed are dropped.
pp = (
    pp
    .assign(
        TitleEn=pp['Title'],
        TitleUk=pp['Predicate'],
        Id=pp['WikidataId'],
    )
    .drop(columns=[
        'Qualifier',
        'ReadTitleUk',
        'SectionId',
        'Used',
        'Title',
        'Predicate',
        'WikidataId',
        'PredicateId',
    ])
)
pp.head()

In [None]:
# Rows without a match in the left join carry NaN in `Count`;
# turn those into 0 and store the column as integers.
pp['Count'] = pp['Count'].fillna(0).astype(int)

In [None]:
# Highest counts first.
pp = pp.sort_values('Count', ascending=False)
pp.head()

In [None]:
# Write the report as a tab-separated UTF-8 file without the index column.
# The path is a raw string on purpose: in a regular literal the `\r` of
# `\reports` and the `\b` of `\by-predicate-count.csv` are escape sequences
# (carriage return / backspace) and would corrupt the Windows path.
# NOTE(review): absolute, machine-specific location - consider a configurable output directory.
pp.to_csv(r'D:\DRIVE\MS CS UCU\Machine Learning\Project\reports\by-predicate-count.csv', index=False, encoding='utf8', sep='\t')

In [None]:
# Length of each `Text` value, computed on the column itself instead of
# building a Series for every row with DataFrame.apply(axis=1).
# Missing values produce NaN rather than raising.
t['Len'] = t['Text'].str.len()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Distribution of `Len` over all rows; labelled so the figure can be read on its own.
ax = t['Len'].hist(bins=80)
ax.set(title='Length of Text, all rows', xlabel='len(Text)', ylabel='Number of rows');

In [None]:
# Number of rows whose `Len` exceeds 1000.
int((t['Len'] > 1000).sum())

In [None]:
# Same histogram restricted to rows with `Len` below 500.
ax = t.loc[t['Len'] < 500, 'Len'].hist(bins=80)
ax.set(title='Length of Text, rows with length < 500', xlabel='len(Text)', ylabel='Number of rows');