In [27]:
import json
from pathlib import Path
from typing import Dict, Any, List, Optional, Set

import typer
import pandas as pd


Message = {}
Context = []



def prepare_messages(tg_history_path, output_path,):
    with open(tg_history_path) as messages_file:
        messages = json.load(messages_file)['messages']

    contexts = _create_contexts(messages)
    contexts = _transform_contexts(contexts)

    contexts_df = pd.DataFrame.from_records(contexts)
    contexts_df.drop_duplicates(inplace=True)
    contexts_df.to_csv(output_path, index=False)


def _create_contexts(messages):
    replies_threads = {}
    id_to_message = {}
    for message in messages:
        id_to_message[message['id']] = message
        if 'reply_to_message_id' in message:
            replies_threads[message['reply_to_message_id']] = message['id']

    contexts = []
    cur_context = _create_default_list()
    visited_replies = set()

    for message in messages:
        if (
            message['type'] != 'message' or
            not message['text'] or
            not isinstance(message['text'], str) or
            message['id'] in visited_replies
        ):
            continue

        if 'forwarded_from' in message and cur_context:
            contexts.append(cur_context)
            cur_context = _create_default_list()
            continue

        if message['id'] in replies_threads:
            contexts.append(cur_context)
            cur_context = _create_default_list()
            _resolve_thread(contexts, replies_threads, visited_replies, id_to_message, message)
            continue

        if cur_context[-1] and message['from_id'] == cur_context[-1]['from_id']:
            contexts[-1][-1]['text'] += '\n' + message["text"]
            continue

        cur_context.pop(0)
        cur_context.append(message)
        contexts.append(cur_context.copy())

    return contexts


def _resolve_thread(
    contexts,
    replies_threads,
    visited_replies,
    id_to_message,
    message,
) -> None:
    cur_context = _create_default_list()
    cur_id = message['id']

    while cur_id:
        cur_context.pop(0)
        cur_context.append(id_to_message[cur_id])
        contexts.append(cur_context.copy())

        visited_replies.add(cur_id)
        cur_id = replies_threads.get(cur_id)


def _transform_contexts(contexts):
    return [_transform_context(context) for context in contexts if any(context)]


def _transform_context(context):
    return {
        'context_3': _transform_message(context[0]),
        'context_2': _transform_message(context[1]),
        'context_1': _transform_message(context[2]),
        'response': _transform_message(context[3]),
    }


def _transform_message(message):
    if not message:
        return None

    if isinstance(message['text'], list):
        texts = [text['text'] if isinstance(text, dict) else text for text in message['text']]
        message['text'] = ''.join(texts)

    return message['text']


def _create_default_list(message = ''):
    return [None, None, None, message]




In [163]:
with open(tg_history_path) as messages_file:
        messages = json.load(messages_file)['messages']

In [164]:
bool('s')

True

In [165]:
messages[7]

{'id': 115,
 'type': 'message',
 'date': '2023-03-01T12:44:03',
 'date_unixtime': '1677663843',
 'from': '–ê–Ω—è ‚ù§Ô∏è',
 'from_id': 'user348973081',
 'forwarded_from': '–ê–Ω—è ‚ù§Ô∏è',
 'text': '5. –ö–∞–∫–æ–π –Ω–∞–∏–±–æ–ª–µ–µ —á–∞—Å—Ç–æ –º—É—Ç–∏—Ä—É—é—â–∏–π –≥–µ–Ω –ø—Ä–∏ —Ä–∞–∫–µ? –ö–∞–∫–∞—è —É –Ω–µ–≥–æ —Ñ—É–Ω–∫—Ü–∏—è?\n*',
 'text_entities': [{'type': 'plain',
   'text': '5. –ö–∞–∫–æ–π –Ω–∞–∏–±–æ–ª–µ–µ —á–∞—Å—Ç–æ –º—É—Ç–∏—Ä—É—é—â–∏–π –≥–µ–Ω –ø—Ä–∏ —Ä–∞–∫–µ? –ö–∞–∫–∞—è —É –Ω–µ–≥–æ —Ñ—É–Ω–∫—Ü–∏—è?\n*'}]}

In [92]:
messages[-95]

{'id': 17273,
 'type': 'message',
 'date': '2023-08-24T08:05:19',
 'date_unixtime': '1692853519',
 'from': '–ú–∞–º–∞',
 'from_id': 'user5070028513',
 'text': '–Ø –±–ª–∏–Ω—ã —Å –º—è—Å–æ–º –∏ –∫–æ—Ñ–µ —Å –º–æ–ª–æ–∫–æ–º –±–µ—Ä—É',
 'text_entities': [{'type': 'plain',
   'text': '–Ø –±–ª–∏–Ω—ã —Å –º—è—Å–æ–º –∏ –∫–æ—Ñ–µ —Å –º–æ–ª–æ–∫–æ–º –±–µ—Ä—É'}]}

In [87]:
messages[-92]

{'id': 17281,
 'type': 'message',
 'date': '2023-08-24T11:47:08',
 'date_unixtime': '1692866828',
 'from': 'No',
 'from_id': 'user5982387868',
 'text': '–ú—ã –ø—Ä–∏–ª–µ—Ç–µ–ª–∏',
 'text_entities': [{'type': 'plain', 'text': '–ú—ã –ø—Ä–∏–ª–µ—Ç–µ–ª–∏'}]}

In [86]:
messages[-93]

{'id': 17275,
 'type': 'message',
 'date': '2023-08-24T08:05:27',
 'date_unixtime': '1692853527',
 'from': 'No',
 'from_id': 'user5982387868',
 'reply_to_message_id': 17273,
 'text': '–ù—É –∏–ª–∏ —Ç–∞–∫',
 'text_entities': [{'type': 'plain', 'text': '–ù—É –∏–ª–∏ —Ç–∞–∫'}]}

In [64]:
messages[-100]

{'id': 17239,
 'type': 'message',
 'date': '2023-08-24T07:12:35',
 'date_unixtime': '1692850355',
 'from': '–ú–∞–º–∞',
 'from_id': 'user5070028513',
 'file': '(File not included. Change data exporting settings to download.)',
 'thumbnail': '(File not included. Change data exporting settings to download.)',
 'media_type': 'sticker',
 'sticker_emoji': 'üòò',
 'width': 512,
 'height': 512,
 'text': '',
 'text_entities': []}

In [175]:
messages[83]

{'id': 194,
 'type': 'message',
 'date': '2023-03-13T13:12:06',
 'date_unixtime': '1678702326',
 'from': 'No',
 'from_id': 'user5982387868',
 'text': '–Ω–∞ 6 —Ç–æ—á–Ω–æ –Ω–∞–ø–∏—à—É',
 'text_entities': [{'type': 'plain', 'text': '–Ω–∞ 6 —Ç–æ—á–Ω–æ –Ω–∞–ø–∏—à—É'}]}

In [76]:
import re 

st = '''['Watch "–ö–∞–∫ –æ—Ç—Å—Ä–æ—á–∏—Ç—å —Å—Ç–∞—Ä–æ—Å—Ç—å? –†–∞—Å—Å–∫–∞–∑—ã–≤–∞–µ—Ç —ç–Ω–¥–æ–∫—Ä–∏–Ω–æ–ª–æ–≥" on YouTube\n', {'type': 'link', 'text': 'https://youtu.be/imgZSjmO4BY'}, '']'''
print(re.search("(?P<url>https?://[^\s]+)", st))

<re.Match object; span=(100, 131), match="https://youtu.be/imgZSjmO4BY'},">


In [114]:
len('''–Ω—Ç–µ—Ä–µ—Å –∫ –º–∞—à–∏–Ω–Ω–æ–º—É –æ–±—É—á–µ–Ω–∏—é —É –º–µ–Ω—è –ø–æ—è–≤–∏–ª—Å—è, –∫–æ–≥–¥–∞ —è —É–∑–Ω–∞–ª–∞ –ø—Ä–æ NLP. 
–°–Ω–∞—á–∞–ª–∞ —è —É—á–∏–ª–∞—Å—å –Ω–∞ —ç–∫–æ–Ω–æ–º–∏–∫–µ –∏ –≤—ã–±—Ä–∞–ª–∞ –≤ –∫–∞—á–µ—Å—Ç–≤–µ –º–∞–π–Ω–æ—Ä–∞ –ò–ê–î, –Ω–æ –º–Ω–µ —Ö–æ—Ç–µ–ª–æ—Å—å –±–æ–ª—å—à–µ –ø–æ–≥—Ä—É–∑–∏—Ç—å—Å—è –≤ Computer Science, –ø–æ—ç—Ç–æ–º—É —è –ø–µ—Ä–µ–ø–æ—Å—Ç—É–ø–∏–ª–∞ –Ω–∞ –ü–ú–ò –≤ –∫–æ–Ω—Ü–µ 1 –∫—É—Ä—Å–∞. –í –º–∞—Ä—Ç–µ —è –±—ã–ª–∞ –Ω–∞ –®–∫–æ–ª–µ –∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ –ò–Ω—Ç–µ–ª–ª–µ–∫—Ç–∞ –ú–¢–°. –¢–∞–º —è –ø–æ—Å–µ—Ç–∏–ª–∞ –ª–µ–∫—Ü–∏—é –°–µ—Ä–≥–µ—è –ó–∞–≥–æ—Ä—É–π–∫–æ, –≥–¥–µ –æ–Ω —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞–ª –æ —Å–≤–æ–µ–π –∫–∞—Ä—å–µ—Ä–µ –∏ –∑–∞–¥–∞—á–∞—Ö, –∫–æ—Ç–æ—Ä—ã–µ –æ–Ω —Ä–µ—à–∞–ª: 3D —Ä–µ–∫–æ–Ω—Å—Ç—Ä—É–∫—Ü–∏—è, –¥–µ—Ç–µ–∫—Ü–∏—è –æ–±—ä–µ–∫—Ç–æ–≤, –±–µ—Å–ø–∏–ª–æ—Ç–Ω—ã–µ –∞–≤—Ç–æ–º–æ–±–∏–ª–∏; –∞ —Ç–∞–∫–∂–µ –æ —Ç–æ–º, –∫–∞–∫ —Ä–∞–∑–≤–∏–≤–∞–ª–∏—Å—å —ç—Ç–∏ –æ–±–ª–∞—Å—Ç–∏ –ø–æ—Å–ª–µ–¥–Ω–∏–µ 10 –ª–µ—Ç. –ï–≥–æ —Ä–∞—Å—Å–∫–∞–∑ –æ—á–µ–Ω—å –º–µ–Ω—è –≤–ø–µ—á–∞—Ç–ª–∏–ª. –ú–Ω–µ –Ω—Ä–∞–≤–∏—Ç—Å—è, —á—Ç–æ –≤ –º–∞—à–∏–Ω–Ω–æ–º –æ–±—É—á–µ–Ω–∏–∏ –µ—Å—Ç—å –º–Ω–æ–≥–æ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã—Ö –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏–π,–∫–æ—Ç–æ—Ä—ã–µ –ø–æ–∑–≤–æ–ª—è—é—Ç —Ä–µ—à–∞—Ç—å –±–æ–ª—å—à–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –∑–∞–¥–∞—á –Ω–∞ —Å—Ç—ã–∫–µ —Ä–∞–∑–Ω—ã—Ö –æ–±–ª–∞—Å—Ç–µ–π. –ú–Ω–µ –±—ã–ª–æ –±—ã –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ –≤ –±—É–¥—É—â–µ–º –ø–æ—Ä–∞–±–æ—Ç–∞—Ç—å –≤ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö —Å—Ñ–µ—Ä–∞—Ö: –æ–±—Ä–∞–±–æ—Ç–∫–µ –µ—Å—Ç–µ—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ —è–∑—ã–∫–∞, –±–∏–æ–∏–Ω—Ñ–æ—Ä–º–∞—Ç–∏–∫–µ, —á–µ–º-—Ç–æ, —Å–≤—è–∑–∞–Ω–Ω–æ–º —Å –∫–æ–º–ø—å—é—Ç–µ—Ä–Ω—ã–º –∑—Ä–µ–Ω–∏–µ–º. –ù–∞ –ú–û–ü–µ –µ—Å—Ç—å –∫—É—Ä—Å—ã –ø–æ –≤—ã–±–æ—Ä—É –ø–æ –≤—Å–µ–º —ç—Ç–∏–º —Ç–µ–º–∞–º, –≥–¥–µ —è —Å–º–æ–≥—É –ø–æ–ª—É—á—à–µ —É–∑–Ω–∞—Ç—å –∫–∞–∂–¥—É—é –∏–∑ –Ω–∏—Ö –∏ –ø–æ–Ω—è—Ç—å, —á—Ç–æ –º–Ω–µ –±–ª–∏–∂–µ
–ê–Ω—è ‚ù§Ô∏è:–ø–µ—Ä–µ–ø–∏—Å–∞–ª–∞ –∫—É—Å–æ–∫ –º–æ—Ç–∏–≤–∞—à–∫–∏ –ø–æ—Å–º–æ—Ç—Ä–∏ –ø–æ–∂–∞–ª—É–π—Å—Ç–∞ü•∫ü•∫ü•∫''')

1024

In [1]:
# from datetime import datetime, timedelta
# import json

# def process_chat(chat_data):
#     # –°–æ—Ä—Ç–∏—Ä–æ–≤–∫–∞ —Å–æ–æ–±—â–µ–Ω–∏–π –ø–æ –¥–∞—Ç–µ
#     sorted_chat = sorted(chat_data, key=lambda x: x['date_unixtime'])

#     dialogs = []
#     current_dialog = ['']
#     prev_time = None

#     for message in sorted_chat:
#         if message['from'] == 'No':
#             # –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –¥–∞—Ç—ã –∏–∑ —Å—Ç—Ä–æ–∫–∏ –≤ –æ–±—ä–µ–∫—Ç datetime
#             message_time = datetime.strptime(message['date'], '%Y-%m-%dT%H:%M:%S')

#             if prev_time is None or (message_time - prev_time) <= timedelta(hours=2):
#                 current_dialog.append(message['text'])
#             else:
#                 if current_dialog:
#                     dialogs.append(' '.join(current_dialog))
#                 current_dialog = [message['text']]

#             prev_time = message_time

#     if current_dialog:
#         dialogs.append(' '.join(current_dialog))

#     return dialogs

# # –ü—Ä–∏–º–µ—Ä –≤—Ö–æ–¥–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
# input_data = [
#     # ... (–≤–∞—à –≤—Ö–æ–¥–Ω–æ–π json –∑–¥–µ—Å—å)
# ]

# dialogs = process_chat(messages)

# for i, dialog in enumerate(dialogs, start=1):
#     print(f"Dialog {i}: {dialog}\n")


In [212]:
messages

[{'id': 101,
  'type': 'message',
  'date': '2023-02-16T18:32:17',
  'date_unixtime': '1676561537',
  'from': '–ê–Ω—è ‚ù§Ô∏è',
  'from_id': 'user348973081',
  'text': '–ù–∏–∫–∏—Ç?',
  'text_entities': [{'type': 'plain', 'text': '–ù–∏–∫–∏—Ç?'}]},
 {'id': 102,
  'type': 'message',
  'date': '2023-02-20T22:12:28',
  'date_unixtime': '1676920348',
  'from': 'No',
  'from_id': 'user5982387868',
  'forwarded_from': 'Other',
  'text': '–ü–æ—á–µ–º—É —è —Ö–æ—á—É –ø–æ–ø–∞—Å—Ç—å –Ω–∞ —à–∫–æ–ª—É –ø–æ –ò–ò –æ—Ç –ú–¢–°?\n\n–ò–ò —è –Ω–∞—á–∞–ª —É–≤–ª–µ–∫–∞—Ç—å—Å—è –µ—â–µ –≤ —à–∫–æ–ª–µ, —Ç–æ–≥–¥–∞ —è —Å–º–æ—Ç—Ä–µ–ª –ø–æ–ø—É–ª—è—Ä–Ω—ã–µ –ª–µ–∫—Ü–∏–∏ –º–∞–ª–æ–≥–æ –®–ê–î–∞ –∏ –≤—ã—Å—Ç—É–ø–ª–µ–Ω–∏—è –ê–Ω–¥—Ä–µ—è –°–µ—Ä–±–∞–Ω—Ç–∞, –ø–æ—Ç–æ–º —è —Ä–µ—à–∏–ª –∏–∑—É—á–∞—Ç—å –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ –Ω–∞ —Å–∏, –≤ –≤—É–∑–µ –æ—Å–≤–æ–∏–ª –ø–∏—Ç–æ–Ω, –ø—Ä–æ—à–µ–ª –∫—É—Ä—Å—ã –ø–æ —Ç–µ–æ—Ä–≤–µ—Ä—É –∏ –º–∞—Ç—Å—Ç–∞—Ç—É –æ—Ç CSC, –≤ —Å–µ–Ω—Ç—è–±—Ä–µ –Ω–∞—á–∞–ª –∑–∞–Ω–∏–º–∞—Ç—å—Å—è –ø—Ä–æ–¥—É–∫—Ç–æ–≤–æ–π –∞–Ω–∞–ª–

In [53]:
import json
from pathlib import Path


from datetime import datetime

def get_dialogues(dump_path, MAX_CONTEXT_LEN = 1200, MAX_TEXT_LEN = 1200, MY_NAME = 'No'):

    with open(dump_path) as messages_file:
            messages = json.load(messages_file)['messages']

            
    dialogues = []
    last_post_date = None
    last_sender = None
    
    curr_input = ''
    curr_output = ''
    curr_context = ''
    
    print(len(messages))

    for message in messages:
        #remove forwarded messages
        if message.get('forwarded_from', None):
            continue
        
        if message.get('text', None):
            txt = message.get('text', None)
            if len(txt) > MAX_TEXT_LEN:
                continue 
            if re.search("(?P<url>https?://[^\s]+)", str(txt)): # remove links 
                continue   
        elif message.get('sticker_emoji', None):
            txt = message.get('sticker_emoji', None)
        else:
            continue
            
        curr_date = datetime.strptime(message.get('date', None),'%Y-%m-%dT%H:%M:%S') 
        curr_sender = message.get('from', None)
        
        if curr_date is None or curr_sender is None:
            continue 
        
        if last_post_date == None:
            last_post_date = curr_date
        if last_sender == None:
            last_sender = curr_sender
        
        if curr_sender == MY_NAME:
            curr_output = curr_output + str(txt) + ' \n '
        else:
            curr_input = curr_input + str(txt) + ' \n '
                
                
        if curr_sender != last_sender:
            dialogues.append({'context':curr_context, 'input':curr_input, 'output':curr_output})
            if len(curr_input) != 0:
                curr_context += '–ê–ù–Ø:' + curr_input
            if len(curr_output) != 0:
                curr_context += '–ù–ò–ö–ò–¢–ê:' + curr_output
            curr_input = ''
            curr_output = ''
            

        if len(curr_context) > MAX_CONTEXT_LEN:
            curr_context = ''
        
        if (curr_date - last_post_date).seconds//3600 > 2: 
            
            last_post_date = None
            last_sender = None
            
            curr_input = ''
            curr_output = ''
            curr_context = ''  
        
        last_sender = curr_sender


        # TO-DO: catch reply_to_message_id
        
    return dialogues

In [35]:
check_message({'id': 111,
  'type': 'message',
  'date': '2023-02-27T22:33:22',
  'date_unixtime': '1677526402',
  'from': '–ê–Ω—è ‚ù§Ô∏è',
  'from_id': 'user348973081',
  'text': [{'type': 'link',
    'text': 'https://docviewer.yandex.ru/view/1049160442/?page=3&*=3Z%2BxL2ib57dQlFuYyC2fnhrzuXh7InVybCI6InlhLWRpc2stcHVibGljOi8vNm5TbGZhdDJuaHlsUko2UDU5M01SczRBK0tuaWZaOUNVRDJLSTM0NnUzdXFCbXZjazJ0OGNXYytzZGU4dnBWb3EvSjZicG1SeU9Kb25UM1ZvWG5EYWc9PTovbGVjdHVyZV9ub3Rlcy9sZWN0dXJlNV9mbWF0Y29tcDIzLnBkZiIsInRpdGxlIjoibGVjdHVyZTVfZm1hdGNvbXAyMy5wZGYiLCJub2lmcmFtZSI6ZmFsc2UsInVpZCI6IjEwNDkxNjA0NDIiLCJ0cyI6MTY3NzUyNjEyMTEwMywieXUiOiI0OTkxMzczOTkxNjU5MDMwMDAzIn0%3D'}],
  'text_entities': [{'type': 'link',
    'text': 'https://docviewer.yandex.ru/view/1049160442/?page=3&*=3Z%2BxL2ib57dQlFuYyC2fnhrzuXh7InVybCI6InlhLWRpc2stcHVibGljOi8vNm5TbGZhdDJuaHlsUko2UDU5M01SczRBK0tuaWZaOUNVRDJLSTM0NnUzdXFCbXZjazJ0OGNXYytzZGU4dnBWb3EvSjZicG1SeU9Kb25UM1ZvWG5EYWc9PTovbGVjdHVyZV9ub3Rlcy9sZWN0dXJlNV9mbWF0Y29tcDIzLnBkZiIsInRpdGxlIjoibGVjdHVyZTVfZm1hdGNvbXAyMy5wZGYiLCJub2lmcmFtZSI6ZmFsc2UsInVpZCI6IjEwNDkxNjA0NDIiLCJ0cyI6MTY3NzUyNjEyMTEwMywieXUiOiI0OTkxMzczOTkxNjU5MDMwMDAzIn0%3D'}]}, 1000)

False

In [241]:
re.search("(?P<url>https?://[^\s]+)", '''[{'type': 'link',
   'text': 'https://docviewer.yandex.ru/view/1049160442/?page=3&*=3Z%2BxL2ib57dQlFuYyC2fnhrzuXh7InVybCI6InlhLWRpc2stcHVibGljOi8vNm5TbGZhdDJuaHlsUko2UDU5M01SczRBK0tuaWZaOUNVRDJLSTM0NnUzdXFCbXZjazJ0OGNXYytzZGU4dnBWb3EvSjZicG1SeU9Kb25UM1ZvWG5EYWc9PTovbGVjdHVyZV9ub3Rlcy9sZWN0dXJlNV9mbWF0Y29tcDIzLnBkZiIsInRpdGxlIjoibGVjdHVyZTVfZm1hdGNvbXAyMy5wZGYiLCJub2lmcmFtZSI6ZmFsc2UsInVpZCI6IjEwNDkxNjA0NDIiLCJ0cyI6MTY3NzUyNjEyMTEwMywieXUiOiI0OTkxMzczOTkxNjU5MDMwMDAzIn0%3D'}]''')

<re.Match object; span=(30, 479), match="https://docviewer.yandex.ru/view/1049160442/?page>

In [275]:
s= '''8 15\n\\             /\n.|           |.\n.|           |.\n..\\         /..\n...|       |...\n...|       |...\n....\\     /....\n.....\\___/.....\n2\ngin 2 %\ntonic 4 * \n 8 15\n\\             /\n.|           |.\n.|           |.\n..\\         /..\n...|       |...\n...|       |...\n....\\     /....\n.....\\___/.....\n2\ngin 2 %\ntonic 4 * \n \\             /\n.|***********|.\n.|***********|.\n..\\*********/..\n...|*******|...\n...|%%%%%%%|...\n....\\%%%%%/....\n.....\\___/..... \n \\             /\n.|***********|.\n.|***********|.\n..\\*********/..\n...|*******|...\n...|%%%%%%%|...\n....\\%%%%%/....\n.....\\___/..... \n '},
 {'context': '–ê–ù–Ø:üòò \n –û–∫–µ –ø—Ä–∏–¥—É —á–µ—Ä–µ–∑ 5 –º–∏–Ω—É—Ç \n –ù–ò–ö–ò–¢–ê:8 15\n\\             /\n.|           |.\n.|           |.\n..\\         /..\n...|       |...\n...|       |...\n....\\     /....\n.....\\___/.....\n2\ngin 2 %\ntonic 4 * \n 8 15\n\\             /\n.|           |.\n.|           |.\n..\\         /..\n...|       |...\n...|       |...\n....\\     /....\n.....\\___/.....\n2\ngin 2 %\ntonic 4 * \n \\             /\n.|***********|.\n.|***********|.\n..\\*********/..\n...|*******|...\n...|%%%%%%%|...\n....\\%%%%%/....\n.....\\___/..... \n \\             /\n.|***********|.\n.|***********|.\n..\\*********/..\n...|*******|...\n...|%%%%%%%|...\n....\\%%%%%/....\n.....\\___/..... \n '''

In [277]:
count_alphabetic_and_emojis(s)/len(s)

0.05587989991659716

In [364]:
import json
from pathlib import Path
import re 
import emoji

from datetime import datetime

def count_alphabetic(input_string):
    count = 0
    
    for char in input_string:
        if char.isalpha():
            count += 1
    return count

def count_emojis(input_string):
    count = 0
    
    for char in input_string:
        if emoji.EMOJI_DATA:
            count += 1
    return count

def check_message(message, MAX_TEXT_LEN, symbols_treshold = 0.5):
    #remove forwarded messa
    if message.get('forwarded_from', None):
        return False
    
    if message.get('text', None):
        text = message.get('text', None)
        if len(text) > MAX_TEXT_LEN:
            return False 
        if re.search("(?P<url>https?://[^\s]+)", str(text)): # remove links 
            return False   
        alph_num = count_alphabetic(str(text))
        emoji_num = count_emojis(str(text))
        
        if (alph_num)/len(str(text)) < symbols_treshold:
            return False
    elif message.get('sticker_emoji', None):
        text = message.get('sticker_emoji', None)
    else:
        return False
        
    curr_date = datetime.strptime(message.get('date', None),'%Y-%m-%dT%H:%M:%S') 
    curr_sender = message.get('from_id', None)
    
    if curr_date is None or curr_sender is None:
        return False 
    
    return {'text':text, 'curr_date':curr_date, 'curr_sender':curr_sender}
    
    


def get_dialogues(dump_path, MAX_CONTEXT_LEN = 200, MAX_TEXT_LEN = 180, my_ids = ['user5982387868', 'user348898603'], prompt = ''): 
    # MAX_CONTEXT_LEN = 500, MAX_TEXT_LEN = 200,

    with open(dump_path) as messages_file:
            messages = json.load(messages_file)['messages']

            
    dialogues = []
    last_post_date = None
    last_sender = None
    last_anna_text = ''
    
    my_text = ''
    anna_text = ''
    curr_context = ''
    
    for message in messages:
        
        message = check_message(message, MAX_TEXT_LEN)
        if not message:
            continue
        else:
            curr_date = message['curr_date']
            curr_sender = message['curr_sender']
            curr_text = message['text']
        
        if last_post_date == None:
            last_post_date = curr_date
        if last_sender == None:
            last_sender = curr_sender
        
        
        if curr_sender != last_sender and curr_sender not in my_ids:
            #dialogues.append({'context':curr_context, 'instruction':anna_text, 'response':my_text})
            if len(anna_text) != 0:
                curr_context += ' –°–æ–±–µ—Å–µ–¥–Ω–∏–∫:' + anna_text + '\n'
            dialogues.append({'context':curr_context, 'response':my_text})
            if len(my_text) != 0:
                curr_context += ' –¢—ã:' + my_text + '\n'
                
            last_anna_text = anna_text
            anna_text = ''
            my_text = ''
        
        if curr_sender in my_ids:
            my_text = my_text + str(curr_text) + ' \n'
        else:
            anna_text = anna_text + str(curr_text) + ' \n'

        
        if len(curr_context) > MAX_CONTEXT_LEN:
            curr_context = ''
        
        if (curr_date - last_post_date).seconds//3600 > 2: 
            
            last_post_date = None
            last_sender = None
            
            curr_input = ''
            curr_output = ''
            curr_context = ''  
        
        last_sender = curr_sender
        last_post_date = curr_date

        # TO-DO: catch reply_to_message_id
        
    return dialogues

In [365]:
import os
import glob

def find_json_files(folder_path, exceptions = ['']):
    json_files = []
    
    search_pattern = os.path.join(folder_path, '**', '*.json')
    
    for file_path in glob.iglob(search_pattern, recursive=True):
        if os.path.isfile(file_path) and file_path not in exceptions:
            json_files.append(file_path)
    
    return json_files

folder_path = '/home/box/digital_clone/dialogues'
#exceptions = ['/home/box/digital_clone/dialogues/tg_dialogues_with_anna_account_1.json']
exceptions = ['']

json_paths = find_json_files(folder_path, exceptions = exceptions)

if json_paths:
    print('-' * 110 )
    print('PARSED_PATHS:')
    print('-' * 110 )
    for json_file in json_paths:
        print(json_file)
else:
    print("There is no json files")

--------------------------------------------------------------------------------------------------------------
PARSED_PATHS:
--------------------------------------------------------------------------------------------------------------
/home/box/digital_clone/dialogues/m_1.json
/home/box/digital_clone/dialogues/anastasia.json
/home/box/digital_clone/dialogues/colya.json
/home/box/digital_clone/dialogues/dasha_hse.json
/home/box/digital_clone/dialogues/genyaa.json
/home/box/digital_clone/dialogues/mama.json
/home/box/digital_clone/dialogues/tg_dialogues_with_anna_account_2.json
/home/box/digital_clone/dialogues/misha.json
/home/box/digital_clone/dialogues/zloy_chelovek.json
/home/box/digital_clone/dialogues/nikita.json
/home/box/digital_clone/dialogues/alan.json
/home/box/digital_clone/dialogues/ekaterina.json
/home/box/digital_clone/dialogues/vova.json
/home/box/digital_clone/dialogues/dima.json
/home/box/digital_clone/dialogues/tg_dialogues_with_anna_account_2_2.json


In [366]:
# prompt = '''–¢–µ–±—è –∑–æ–≤—É—Ç –ù–∏–∫–∏—Ç–∞. –¢–µ–±–µ 21 –≥–æ–¥. –£ —Ç–µ–±—è –µ—Å—Ç—å –¥–µ–≤—É—à–∫–∞ –ê–Ω—è, –≤—ã –≤—Å—Ç—Ä–µ—á–∞–µ—Ç–µ—Å—å 2 –≥–æ–¥–∞ –∏ —Ç—ã –µ–µ –æ—á–µ–Ω—å –ª—é–±–∏—à—å. 
# –¢—ã –æ–±—â–∞–µ—à—å—Å—è —Å –ê–Ω–µ–π, –ø—Ä–æ–¥–æ–ª–∂–∏ –¥–∏–∞–ª–æ–≥: '''

all_dialogues = []


for path in json_paths:
    # –¥–∏–∞–ª–æ–≥–∏ —Å —Ä–∞–∑–Ω—ã—Ö –∞–∫–∫–∞—É–Ω—Ç–æ–≤ —Ç–≥
    dialogues = get_dialogues(path)
    
    if path == '/home/box/digital_clone/dialogues/tg_dialogues_with_anna_account_2_2.json':
        all_dialogues += dialogues[:5000]
    else:
        all_dialogues += dialogues
    print(len(dialogues))

235
353
79
2655
60
897
3475
5863
344
106
452
140
155
1214
23032


In [371]:
len(all_dialogues)

21028

In [372]:
all_dialogues

[{'context': ' –°–æ–±–µ—Å–µ–¥–Ω–∏–∫:–ü–æ—á–µ–º—É —Ç—ã —Ç—É—Ç –≤ —Å–µ—Ç–∏ –±—ã–ª \n–ê –≤ –≤–∫ –Ω–µ—Ç? \n\n',
  'response': '–Ø —Å–ª—É—á–∞–π–Ω–æ –∑–∞—à–µ–ª –≤ —Ç–µ–ª–µ–≥—É ... \n'},
 {'context': ' –°–æ–±–µ—Å–µ–¥–Ω–∏–∫:–ü–æ—á–µ–º—É —Ç—ã —Ç—É—Ç –≤ —Å–µ—Ç–∏ –±—ã–ª \n–ê –≤ –≤–∫ –Ω–µ—Ç? \n\n –¢—ã:–Ø —Å–ª—É—á–∞–π–Ω–æ –∑–∞—à–µ–ª –≤ —Ç–µ–ª–µ–≥—É ... \n\n –°–æ–±–µ—Å–µ–¥–Ω–∏–∫:üòÑ \n\n',
  'response': '–ê–æ–∞–æ–∞–æ–∞–æ–∞–æ–∞ \n'},
 {'context': ' –°–æ–±–µ—Å–µ–¥–Ω–∏–∫:–ü–æ—á–µ–º—É —Ç—ã —Ç—É—Ç –≤ —Å–µ—Ç–∏ –±—ã–ª \n–ê –≤ –≤–∫ –Ω–µ—Ç? \n\n –¢—ã:–Ø —Å–ª—É—á–∞–π–Ω–æ –∑–∞—à–µ–ª –≤ —Ç–µ–ª–µ–≥—É ... \n\n –°–æ–±–µ—Å–µ–¥–Ω–∏–∫:üòÑ \n\n –¢—ã:–ê–æ–∞–æ–∞–æ–∞–æ–∞–æ–∞ \n\n –°–æ–±–µ—Å–µ–¥–Ω–∏–∫:üò¥ \n\n',
  'response': '–ù–∞ –∫–∞–∫–æ–π –∞–∫–∫ ? \n'},
 {'context': ' –°–æ–±–µ—Å–µ–¥–Ω–∏–∫:–ü–æ—á–µ–º—É —Ç—ã —Ç—É—Ç –≤ —Å–µ—Ç–∏ –±—ã–ª \n–ê –≤ –≤–∫ –Ω–µ—Ç? \n\n –¢—ã:–Ø —Å–ª—É—á–∞–π–Ω–æ –∑–∞—à–µ–ª –≤ —Ç–µ–ª–µ–≥—É ... \n\n –°–æ–±–µ—Å–µ–¥–Ω–∏–∫:üòÑ \n\n –¢—ã:–ê–æ–∞–æ–∞–æ–∞–æ–∞–æ–∞ \n\n –°–æ–±–µ—Å–µ–¥–Ω–∏–∫:üò¥ \n\n –¢—ã

In [374]:
with open('./all_dialogues.json', 'w', encoding = 'utf8') as json_file:
    json.dump(all_dialogues, json_file, indent=4, ensure_ascii=False)