# Parser for Instagram Loader Files

This script reads every file with a `.json` extension in the same folder as this notebook (the current working directory) and builds two pandas DataFrames: one for posts and one for comments.

**IMPORTANT:** InstagramLoader seems to save the .json files compressed (in .xz format). You need to extract these files before running the parser. On macOS, for example, you could use The Unarchiver for this.

In [None]:
import pandas as pd 
import os
import json
import sys
import datetime

In [None]:
def parse_comments(id_alt,comment_edges,post_id=None):
    """Flatten a list of comment dicts into a DataFrame, one row per comment.

    Args:
        id_alt: File name used to derive the linking id when ``post_id`` is
            falsy. The first two ``_``-separated parts are joined and the
            dashes removed.
        comment_edges: Iterable of dicts, one per comment.
        post_id: Optional id stored in ``post_comment_id`` instead of the
            value derived from ``id_alt``.

    Returns:
        A pandas DataFrame whose first column is ``post_comment_id``. Nested
        dicts are expanded up to two levels below the top, with column names
        built by joining the keys with ``_``; anything deeper is stored as-is.
    """
    def _walk(mapping, prefix, descents_left):
        # Yield (column, value) pairs in insertion order, expanding nested
        # dicts only while the descent budget lasts.
        for name, payload in mapping.items():
            column = name if prefix is None else prefix + '_' + str(name)
            if type(payload) is dict and descents_left:
                yield from _walk(payload, str(column), descents_left - 1)
            else:
                yield column, payload

    rows = []
    for edge in comment_edges:
        # The fallback id is computed per row, so id_alt is only touched when
        # there is at least one comment and no explicit post_id.
        if post_id:
            link_id = post_id
        else:
            link_id = "".join(id_alt.split("_")[:2]).replace("-","")
        row = {'post_comment_id': link_id}
        row.update(_walk(edge, None, 2))
        rows.append(row)
    return pd.DataFrame(rows)
                

In [None]:
def _flatten_post_fields(mapping, descents_left, prefix=None):
    """Yield ``(column, value)`` pairs for *mapping*, expanding nested dicts.

    Nested dicts are expanded while ``descents_left`` is positive; column
    names are the keys along the path joined with ``_``. Values found once
    the budget is exhausted are yielded unchanged, even if they are dicts.
    """
    for key, value in mapping.items():
        column = key if prefix is None else prefix + '_' + str(key)
        if isinstance(value, dict) and descents_left > 0:
            yield from _flatten_post_fields(value, descents_left - 1, str(column))
        else:
            yield column, value


def parse_post(item, id_alt):
    """Flatten one post node into a single-row DataFrame.

    Args:
        item: Dict describing the post (the ``node`` object of a post file).
        id_alt: File name of the post. The first two ``_``-separated parts
            are joined and the dashes removed to form ``post_comment_id``.

    Returns:
        A one-row pandas DataFrame. ``post_comment_id`` is always the first
        column, including when ``item`` is empty. Nested dicts are expanded
        up to three levels below the top, with ``_``-joined column names.
        ``post_text`` holds the first caption's text when the node has one.
    """
    # The linking id does not depend on the node's keys, so set it once up
    # front; this also guarantees the column exists for an empty node.
    row = {'post_comment_id': "".join(id_alt.split("_")[:2]).replace("-", "")}
    row.update(_flatten_post_fields(item, 3))

    # The caption is optional: a post without one has no matching key or an
    # empty edge list, and those are the only failures worth ignoring here.
    try:
        row['post_text'] = item['edge_media_to_caption']['edges'][0]['node']['text']
    except (KeyError, IndexError, TypeError):
        pass

    return pd.DataFrame([row])

In [None]:
def _read_json_file(path):
    """Load one JSON document from *path*, decoding the file as UTF-8."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


def _stack_frames(frames):
    """Concatenate *frames* row-wise, or return an empty DataFrame if none have rows."""
    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        return pd.DataFrame()
    return pd.concat(non_empty)


def parse_instagram(filesave):
    """Parse the .json files in the current directory and pickle the results.

    Files whose name ends in ``.json`` are split into comment files (name
    contains ``comments``) and post files (everything else). Two pickles are
    written:

    * ``<filesave>_posts.pkl`` - one row per post file.
    * ``<filesave>_comments.pkl`` - the rows parsed from the comment files.
      When the folder holds no comment files, the comments embedded in the
      post files (``node.edge_media_to_comment.edges``) are written instead.

    Args:
        filesave: Prefix for the output file names, without extension.

    Returns:
        None. The results are only written to disk.
    """
    # Match on the suffix rather than a substring so that compressed
    # originals such as 'x.json.xz' left next to the extracted files are
    # not opened as JSON. Sorting makes the row order reproducible.
    json_files = [item for item in sorted(os.listdir()) if item.endswith('.json')]
    file_posts = [item for item in json_files if 'comments' not in item]
    file_comments = [item for item in json_files if 'comments' in item]
    print('files found for posts:', file_posts)
    print('files found for comments:', file_comments)

    # Frames are collected in lists and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied on every call.
    post_frames = []
    embedded_frames = []
    for file in file_posts:
        data = _read_json_file(file)
        print('started with', file)
        try:
            # Take the embedded comments out of the node so they are not
            # also flattened into a column of the posts frame.
            comment_edges = data['node']['edge_media_to_comment'].pop('edges')
            post_id = data['node']['id']
            # NOTE(review): these rows carry the node id in post_comment_id,
            # whereas the posts frame derives post_comment_id from the file
            # name; the node id is also available as the posts' 'id' column.
            embedded_frames.append(
                parse_comments(file, comment_edges, post_id=post_id)
            )
        except (KeyError, TypeError, AttributeError) as exc:
            # Embedded comments are optional; report the reason and carry on.
            print('issue with comments for', file, '-', repr(exc))
        post_frames.append(parse_post(data['node'], file))
        print('completed', file)

    posts = _stack_frames(post_frames)
    posts_comments = _stack_frames(embedded_frames)
    posts.to_pickle(filesave + '_posts.pkl')

    print('files found for comments:', file_comments)
    comment_frames = []
    for file in file_comments:
        data = _read_json_file(file)
        print('started with', file)
        comment_frames.append(parse_comments(file, data, post_id=None))
        print('completed', file)

    # Dedicated comment files take precedence. The embedded comments are the
    # fallback, so they are no longer overwritten by an empty frame when the
    # folder has no comment files.
    comments = _stack_frames(comment_frames) if file_comments else posts_comments
    comments.to_pickle(filesave + '_comments.pkl')
        
        
        
    

    
#     return posts, posts_comments, comments
    





Run the cell below after changing `'filename'` to the name you want the results to be saved under. Do not include dots or a file extension. For example, if you enter `'test_1'`, two files will be generated: `test_1_comments.pkl`, containing the comments, and `test_1_posts.pkl`, containing the posts found in the folder. The column `post_comment_id` links the comments file to the posts file, allowing you to connect/merge the two DataFrames.

In [None]:
# Entry point of the notebook: parse the .json files in the working folder and
# write '<prefix>_posts.pkl' and '<prefix>_comments.pkl' using this prefix.
parse_instagram(filesave='filename')