In [1]:
import os
import pandas as pd
import glob

In [2]:
# Root of the data directory, relative to this notebook.
DATA_ROOT = "../data"

# Inputs: the directories globbed for message exports (*.csv) and metadata (*.json).
DIALOGS_DATA_PATH = f"{DATA_ROOT}/dialogs"
DIALOGS_META_DATA_PATH = f"{DATA_ROOT}/dialogs_meta"

# Output: directory that receives the merged CSV files.
MERGED_DATA_PATH = f"{DATA_ROOT}/merged_data"

In [3]:
!ls ../data/

[1m[36mdialogs[m[m      [1m[36mdialogs_meta[m[m [1m[36mmerged_data[m[m


In [4]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of the merged frames reproducible across machines and re-runs.
dialogs_data_files = sorted(glob.glob(f"{DIALOGS_DATA_PATH}/*.csv"))
dialogs_meta_data_files = sorted(glob.glob(f"{DIALOGS_META_DATA_PATH}/*.json"))

## Merging all dialogs_data (CSV files) into one DataFrame

In [5]:
# Read every per-dialog CSV export and tag its rows with the dialog it came from.
if not dialogs_data_files:
    # Without this guard pd.concat fails later with an opaque
    # "No objects to concatenate" message.
    raise FileNotFoundError(f"no *.csv files found in {DIALOGS_DATA_PATH}")

# The dialog id is the file name without its extension. splitext strips only
# the final extension, so a file name that itself contains dots is not truncated.
# NOTE(review): dialog_id is a str here; confirm it matches the dtype of
# dialog_id in the metadata frame before joining the two.
df_array = [
    pd.read_csv(path).assign(
        dialog_id=os.path.splitext(os.path.basename(path))[0]
    )
    for path in dialogs_data_files
]

df = pd.concat(df_array, ignore_index=True)

In [6]:
# (rows, columns) of the merged frame, before the "Unnamed: 0" column is dropped.
df.shape

(2576905, 10)

In [7]:
# Create the output directory. makedirs also creates missing parent
# directories, and exist_ok=True makes the cell safe to re-run.
os.makedirs(MERGED_DATA_PATH, exist_ok=True)

# "Unnamed: 0" is presumably an index column written out by whatever produced
# the per-dialog CSVs (TODO confirm); drop it when present. The column is
# selected with columns= rather than a boolean axis argument.
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

# Write all messages from all dialogs into a single CSV, without the row index.
df.to_csv(f"{MERGED_DATA_PATH}/dialogs_data_all.csv", index=False)

In [8]:
# Shape after dropping "Unnamed: 0": same row count, one column fewer.
df.shape

(2576905, 9)

In [9]:
# Preview the first 10 merged rows.
# NOTE(review): the rendered output contains user ids and message text;
# consider clearing it before sharing this notebook.
df.head(10)

Unnamed: 0,id,date,from_id,to_id,fwd_from,message,type,duration,dialog_id
0,879,2021-04-29 09:10:56+00:00,PeerUser(user_id=383618515),PeerChannel(channel_id=1480379241),,дяки ❤️,text,,-1001480379241
1,878,2021-04-29 09:09:44+00:00,PeerUser(user_id=281396127),PeerChannel(channel_id=1480379241),,Просто чаще всего добавляются неофициальные и ...,text,,-1001480379241
2,877,2021-04-29 09:09:17+00:00,PeerUser(user_id=281396127),PeerChannel(channel_id=1480379241),,"Привет, вроде как да",text,,-1001480379241
3,876,2021-04-29 09:05:30+00:00,PeerUser(user_id=383618515),PeerChannel(channel_id=1480379241),,чи інша кількість ?,text,,-1001480379241
4,875,2021-04-29 09:05:24+00:00,PeerUser(user_id=383618515),PeerChannel(channel_id=1480379241),,таке запитаннячко: офіційно ж можливо дві пере...,text,,-1001480379241
5,874,2021-04-29 09:05:06+00:00,PeerUser(user_id=383618515),PeerChannel(channel_id=1480379241),,Привітики 👋,text,,-1001480379241
6,873,2021-02-26 10:52:19+00:00,PeerUser(user_id=383618515),PeerChannel(channel_id=1480379241),,так,text,,-1001480379241
7,872,2021-02-26 10:45:56+00:00,PeerUser(user_id=383618515),PeerChannel(channel_id=1480379241),,вона тоді б з бюджету злетіла,text,,-1001480379241
8,871,2021-02-26 10:45:37+00:00,PeerUser(user_id=383618515),PeerChannel(channel_id=1480379241),,немає,text,,-1001480379241
9,870,2021-02-26 09:02:57+00:00,PeerUser(user_id=61792181),PeerChannel(channel_id=1480379241),,Може академзаборгованість?,text,,-1001480379241


In [10]:
# Earliest and latest message timestamps, using the pandas reductions rather
# than the Python builtins (which iterate over the Series element by element).
# NOTE(review): "date" holds strings here, so the comparison is lexicographic;
# that matches chronological order only if every value shares the same
# ISO-8601 layout and UTC offset. Confirm, or parse with pd.to_datetime first.
df["date"].min(), df["date"].max()

('2016-02-01 13:34:12+00:00', '2022-11-01 01:01:34+00:00')

In [11]:
# Number of messages per message type.
df.groupby("type")["type"].count()

type
photo       260436
sticker      47869
text       2216315
video        43491
voice         8794
Name: type, dtype: int64

In [12]:
# Summed "duration" per message type.
df.groupby("type")["duration"].sum()

type
photo            0.0
sticker          0.0
text             0.0
video      2533484.0
voice       228398.0
Name: duration, dtype: float64

## Merging all dialogs_meta_data (JSON files) into one DataFrame

In [13]:
# Read every per-dialog metadata JSON file and stack the results.
if not dialogs_meta_data_files:
    # Without this guard pd.concat fails later with an opaque
    # "No objects to concatenate" message.
    raise FileNotFoundError(f"no *.json files found in {DIALOGS_META_DATA_PATH}")

# "id" is renamed to "dialog_id" so the key column carries the same name as in
# the messages frame.
# NOTE(review): judging by the head() output, read_json yields one row per
# entry in "users"; a dialog with an empty user list would presumably
# contribute no rows. Confirm.
df_array = [
    pd.read_json(path).rename(columns={"id": "dialog_id"})
    for path in dialogs_meta_data_files
]

df_meta = pd.concat(df_array, ignore_index=True)

In [14]:
# Persist the merged metadata without the row index. Relies on MERGED_DATA_PATH
# already existing (it is created in the cell that writes dialogs_data_all.csv).
# NOTE(review): the "users" column appears to hold dicts (see the head()
# output below); to_csv would then write their Python repr rather than JSON.
# Confirm that downstream readers expect this.
df_meta.to_csv(f"{MERGED_DATA_PATH}/dialogs_users_all.csv", index=False)

In [15]:
# (rows, columns) of the merged metadata frame.
df_meta.shape

(11868, 4)

In [16]:
# Preview the first 10 metadata rows.
# NOTE(review): the rendered output contains user ids and names;
# consider clearing it before sharing this notebook.
df_meta.head(10)

Unnamed: 0,dialog_id,name,type,users
0,-280495890,Дозор 32-го созива,Group,"{'user_id': 255026222, 'first_name': 'Kyrylo',..."
1,-280495890,Дозор 32-го созива,Group,"{'user_id': 403299058, 'first_name': 'Vladysla..."
2,-280495890,Дозор 32-го созива,Group,"{'user_id': 365047140, 'first_name': 'Жёнушка'..."
3,-280495890,Дозор 32-го созива,Group,"{'user_id': 362015063, 'first_name': 'Маленька..."
4,-280495890,Дозор 32-го созива,Group,"{'user_id': 281396127, 'first_name': 'Обережно..."
5,-280495890,Дозор 32-го созива,Group,"{'user_id': 382015004, 'first_name': 'Сальпака..."
6,506161961,Vladyslav Matus,Private dialog,"{'user_id': 506161961, 'first_name': 'Vladysla..."
7,-1001695031361,Могилянський Кібер-плац,Group,"{'user_id': 255144521, 'first_name': 'Трон', '..."
8,-1001695031361,Могилянський Кібер-плац,Group,"{'user_id': 473669329, 'first_name': 'Кіпішна ..."
9,-1001695031361,Могилянський Кібер-плац,Group,"{'user_id': 670322433, 'first_name': 'Сах', 'l..."


In [17]:
# Row count per dialog type.
# NOTE(review): judging by the head() output there is one row per
# (dialog, user) pair, so this presumably counts memberships rather than
# distinct dialogs. Confirm.
df_meta.groupby("type")["type"].count()

type
Channel             192
Group             11167
Private dialog      509
Name: type, dtype: int64