In [1]:
# Python 2 notebook (uses unicodecsv and passes UTF-8 byte strings between steps)
import json
from nltk import ngrams
import codecs
import unicodedata
import operator
import unicodecsv as csv

from os import listdir
from os.path import isfile, join
import re

import nltk

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Plot defaults for the notebook: ggplot style sheet and a wide 12 x 5 figure.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

In [2]:
def income_message_processing(message, remove_bot_messages=True):
    """Strip chat-service boilerplate from one message and return its text.

    Args:
        message: dict with a 'text' entry (the message text) and an 'income'
            entry. Only messages whose 'income' equals 1 are cleaned.
        remove_bot_messages: when true, the bot's anonymity notice and its
            "poll / chat" prompt are also removed from incoming messages.

    Returns:
        The message text encoded as a UTF-8 byte string. Messages whose
        'income' is not 1 are returned unchanged apart from the encoding.
    """
    # Clean the text while it is still unicode and encode once at the end.
    # The resulting bytes are the same as when cleaning the encoded string,
    # but the regex calls never mix byte strings with text patterns.
    text = message['text']

    if message['income'] == 1:
        # Drop the interlocutor prefix (label, colon, optional space/newline);
        # everything after the first occurrence is kept.
        text = re.split(u'Собеседник: ?\n?', text, maxsplit=1)[-1]
        # A "sticker sent" notice carries no text of its own.
        text = text.replace(u'Собеседник отправил стикер', u'')
        if remove_bot_messages:
            text = text.replace(u'Бот: Мы заботимся о вашей анонимности и не позволяем пересылать сообщения', u'')
            # The optional "\n?" / " ?" parts make this a regular expression,
            # so it must go through re.sub: str.replace would search for
            # literal '?' characters and never match the real prompt.
            text = re.sub(u'Чтобы получить самый интересный опрос, напиши "опрос" или "!"\n? ?Чтобы начать анонимный чат, напиши "чат"\n?', u'', text)
    return text.encode('utf8')

In [3]:
def one_user_json_to_dict_of_dialog_messages(data, work_type='income'):
    """Collapse every dialog of one user into a single normalised string.

    Args:
        data: parsed JSON record with a 'user_id' entry and a 'dialogs' entry
            (a sequence of dialogs, each a sequence of message dicts that
            carry 'income' and 'text').
        work_type: 'income' keeps the messages whose 'income' flag is 1; any
            other value keeps the messages whose flag is 0.
            NOTE(review): an earlier description called the alternative
            "all messages", but the comparison below selects only flag-0
            messages -- confirm which behaviour is intended.

    Returns:
        (user_id, {dialog_index: normalised_text}) where dialog_index is the
        position of the dialog inside data['dialogs'].
    """
    wanted_flag = 1 if work_type == 'income' else 0

    user_id = data['user_id']
    dialogs_ngramms_dict = {}
    for d_i, dialog in enumerate(data['dialogs']):
        mline = [
            income_message_processing(message)
            for message in dialog
            if message['income'] == wanted_flag
        ]
        # Separate messages with a space: joining them back to back would fuse
        # the last word of one message with the first word of the next and
        # produce n-grams that never occurred in the conversation.
        messages_line = ' '.join(mline)
        dialogs_ngramms_dict[d_i] = normalise_document(messages_line)
    return user_id, dialogs_ngramms_dict

In [4]:
def normalise_document(doc):
    """
    Convert document to lower-case and remove accents

    Args:
        doc: the document, either as a UTF-8 encoded byte string or as
            already-decoded unicode text.

    Returns:
        A normalised document as unicode
    """
    # Decode only when needed: decoding text that is already unicode raises
    # TypeError, and the `unicode` builtin does not exist on Python 3.
    if isinstance(doc, (bytes, bytearray)):
        doc = doc.decode('utf-8')
    # NFD splits accented characters into base character + combining marks,
    # so dropping the combining marks leaves the unaccented base characters.
    decomposed = unicodedata.normalize('NFD', doc.lower())
    return u''.join(c for c in decomposed if not unicodedata.combining(c))

In [5]:
def message_line_to_ngramm(message_line, n=3):
    """Yield the character n-grams of every token in message_line.

    The line is split with nltk's WordPunctTokenizer; tokens shorter than n
    characters are skipped, and each remaining token contributes all of its
    n-character windows, joined back into unicode strings.
    """
    words = nltk.WordPunctTokenizer().tokenize(message_line)
    for word in words:
        if len(word) < n:
            continue
        for chars in nltk.ngrams(word, n):
            yield u"".join(chars)

In [6]:
def select_features_by_one_dialog(token_freq, top_tokens=10):
    """
    Pick the most frequent tokens of one dialog.

    Args:
        token_freq: dict mapping token -> number of occurrences.
        top_tokens: maximum number of entries to return.

    Returns:
        list of (token, count) tuples sorted by count, highest first, with at
        most top_tokens entries. Tokens with equal counts keep the order in
        which the dict yields them.
    """
    # .items() works on both Python 2 and 3; .iteritems() exists only on 2.
    sorted_token_freq = sorted(token_freq.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_token_freq[:top_tokens]

In [7]:
def data_to_features(data):
    """Compute the top character n-grams of every dialog of one user.

    Returns:
        (user_id, {dialog_index: top n-gram entries}) where the entries are
        whatever select_features_by_one_dialog returns for that dialog's
        n-gram counts.
    """
    user_id, lines_by_dialog = one_user_json_to_dict_of_dialog_messages(data)
    top_by_dialog = {}
    for dialog_idx in lines_by_dialog:
        # Count how often each n-gram occurs in this dialog.
        counts = {}
        for gram in message_line_to_ngramm(lines_by_dialog[dialog_idx]):
            if gram in counts:
                counts[gram] += 1
            else:
                counts[gram] = 1
        top_by_dialog[dialog_idx] = select_features_by_one_dialog(counts)
    return user_id, top_by_dialog

In [8]:
def data_to_csv(data):
    """Build the CSV rows for one user record.

    Returns:
        A list with one [user_id, dialog_index, top_tokens_json] row per
        dialog, where top_tokens_json is the dialog's top n-gram entries
        serialised with json.dumps.
    """
    user_id, top_by_dialog = data_to_features(data)
    return [
        [user_id, dialog_idx, json.dumps(tokens)]
        for dialog_idx, tokens in top_by_dialog.items()
    ]

In [9]:
# Folder holding the per-group input files, named '<group_id>.json'.
input_folder_path = 'json_dialogs'
# Folder receiving the per-group output files, named '<group_id>.csv'.
output_folder_path = 'csv_tokens'
# Identifier used to build both the input and the output file name.
group_id = 145254340

In [10]:
# Convert the group's dialogs (one JSON record per input line) into a CSV with
# one row per dialog. Both files are managed by a single `with` statement so
# they are flushed and closed even if a record fails to parse or process;
# opening and closing them in separate cells leaked the handles on any error.
with open(join(input_folder_path, '{}.json'.format(group_id))) as f, \
        open(join(output_folder_path, '{}.csv'.format(group_id)), "wb+") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["user_id", "dialog_number", "top_tokens"])
    for line in f:
        # A blank line (e.g. a trailing newline) is not a JSON record.
        if not line.strip():
            continue
        data = json.loads(line)
        for item in data_to_csv(data):
            writer.writerow(item)