### Loading the data from JSON files to a Pandas DF

In [27]:
#import necessary packages
import pandas as pd
import os
from fbjson2table.func_lib import parse_fb_json
from fbjson2table.table_class import TempDFs
from enchant.checker import SpellChecker




In [16]:

def sort_key_files(file):
    """
    Sort key for numbered export files such as ``message_3.json``.

    Returns the integer found between the first underscore and the following dot.
    Raises IndexError when the name has no underscore and ValueError when that
    piece is not an integer.
    """
    # Second underscore-separated piece, e.g. "3.json"
    numbered_part = file.split('_')[1]
    # Keep only what precedes the first dot, e.g. "3"
    digits = numbered_part.split('.')[0]
    return int(digits)

def load_all_messages_in_folder(path):
    """
    Load every numbered message file of one conversation folder into a single DataFrame.

    The folder is expected to contain files named like ``message_x.json`` where x is an
    integer. They are loaded in ascending order of x and their message tables are
    stacked in that order.

    path: folder that holds the JSON files.
    Returns a pandas DataFrame (empty when the folder has no JSON file).
    """
    # Only regular .json files are considered: sub-folders and other stray entries
    # have no "_<number>." part and would make the numeric sort key raise.
    message_files = [
        name for name in os.listdir(path)
        if name.endswith('.json') and os.path.isfile(os.path.join(path, name))
    ]
    message_files.sort(key=sort_key_files)
    print(message_files)

    # Collect the per-file tables and concatenate once instead of growing a
    # DataFrame inside the loop.
    frames = []
    for name in message_files:
        json_content = parse_fb_json(os.path.join(path, name))
        temp_dfs = TempDFs(json_content)
        # The third table is the one used as the message table here
        # NOTE(review): presumably the per-message rows -- confirm against fbjson2table
        frames.append(temp_dfs.df_list[2])

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [47]:


def is_in_english(quote,max_error_count = 3):
    """
    Guess whether a text is English from its number of spelling mistakes.

    quote: the text to check.
    max_error_count: highest number of misspelled words still accepted as English.
    Returns True when the en_US spell checker reports at most max_error_count
    misspelled words, False otherwise.
    """
    checker = SpellChecker("en_US")
    checker.set_text(quote)
    # Iterating the checker yields one entry per misspelled word
    misspelled = [mistake.word for mistake in checker]
    return len(misspelled) <= max_error_count

In [65]:
def clean_messages(df,yourname,english_only=False):
    """
    Keep only real chat messages and reduce them to content + sender.

    Rows are dropped when:
      * 'content' is null,
      * the first space-separated word of the content is "Reacted" (reaction rows),
      * 'type' is not 'Generic',
      * the content starts with 'You changed the group' or 'You named the group'.

    df: DataFrame with at least the columns 'content', 'type' and 'sender_name'.
        It is not modified.
    yourname: sender name to keep as is; every other sender becomes 'Person'.
    english_only: when True, additionally keep only messages accepted by
        is_in_english (off by default).
    Returns a new DataFrame with the columns ['content', 'sender_name'].
    """
    messages = df.dropna(subset=['content'])
    # All tests run on a string view of the content, so non-string values cannot
    # break the .str accessor; the returned 'content' keeps the original values.
    text = messages['content'].astype(str)

    is_reaction = text.map(lambda s: s.split(' ', 1)[0]) == 'Reacted'
    is_group_event = (
        text.str.startswith('You changed the group')
        | text.str.startswith('You named the group')
    )
    is_generic = messages['type'] == 'Generic'

    # One combined mask and a single .loc selection: no helper column is added and
    # nothing is ever assigned on a filtered slice.
    keep = is_generic & ~is_reaction & ~is_group_event
    messages = messages.loc[keep, ['content', 'sender_name']]

    if english_only:
        # Spell checking is the costly step, so it only runs on the surviving rows
        in_english = messages['content'].apply(lambda s: is_in_english(str(s))).astype(bool)
        messages = messages.loc[in_english]

    senders = messages['sender_name']
    # assign() returns a new frame instead of writing into the selection
    return messages.assign(sender_name=senders.where(senders == yourname, 'Person'))

In [66]:
def load_all_folders(folders, data_dir='./data'):
    """
    Given a list of folders, load all messages in folders and return a single dataframe

    folders: iterable of folder names, each looked up as ``<data_dir>/<folder>``.
    data_dir: root directory that contains the folders (defaults to './data').
    Returns one pandas DataFrame; it is empty when no folder is given. The rows of
    the LAST folder come first, then the one before it, and so on.
    """
    frames = [
        load_all_messages_in_folder(f'{data_dir}/{folder}')
        for folder in folders
    ]
    if not frames:
        return pd.DataFrame()
    # Every folder used to be prepended to the running result; reversing the list
    # reproduces that order with a single concatenation.
    return pd.concat(frames[::-1])


In [67]:
def to_document(df):
    """
    Returns a single string with one "Sender:Message" entry per row, joined by newlines.

    df: DataFrame with the string columns 'sender_name' and 'content'.
        It is left untouched (no helper column is added to it).
    Returns '' for a frame without rows.
    """
    # Build the lines in a standalone Series rather than as a new column on df,
    # so the caller's frame is not modified.
    lines = df['sender_name'] + ":" + df['content']
    return '\n'.join(lines.tolist())

In [70]:
# Driver cell: load the listed conversation folders, clean them, and export one text file.
# Replace the folder names with your own; load_all_folders looks each one up under ./data
folders=['inwi','seller']
# Merge the messages of every listed folder into one DataFrame
df=load_all_folders(folders)
# Filter the rows; every sender other than the given name is relabelled 'Person'
clean_df=clean_messages(df,'Tariq Massaoudi')
# Write the whole conversation to a UTF-8 text file in the working directory
with open("disscussion.txt", mode='w', encoding='utf-8') as file:
    file.write(to_document(clean_df))


['message_1.json']
['message_1.json']


In [58]:
# NOTE(review): `m` is not defined in any cell shown here -- presumably left over from an earlier kernel session; confirm or remove
m

Unnamed: 0,id_0,id_messages_1,sender_name,timestamp_ms,content,type,is_unsent,is_taken_down
0,0,0,Seller,1564428506222,Soufiane is waiting for your response about th...,Generic,False,False
1,0,1,Seller,1564420905218,Deja vendu,Generic,False,False
2,0,2,Tariq Massaoudi,1564420302377,salam soufian je suis interessé par le pixel 3,Generic,False,False
3,0,3,Tariq Massaoudi,1564420300269,You changed the group photo.,Generic,False,False
4,0,4,Tariq Massaoudi,1564420299716,You named the group Tariq · Google pixel 3 éta...,Generic,False,False
0,0,0,inwi,1585650833201,"Bonjour Tariq, j’espère que vous vous portez b...",Generic,False,False
1,0,1,Tariq Massaoudi,1585578647702,comment bénéficier de l'offre x15?,Generic,False,False
2,0,2,inwi,1585578628927,"Bonjour Tariq , nous venons de prendre connais...",Generic,False,False
3,0,3,Tariq Massaoudi,1585578628030,bonjour inwi,Generic,False,False
