# Import text from the JSON file for SNA

Go through all comments (and comments of comments) and convert each post into a row of the final dataframe.

In [1]:
# load libraries
import json 
import pandas as pd

In [2]:
# read json file

with open("./CSMM101_3T2020_discussion_threads.json") as f:
    discussions = json.load(f)

f.close()

In [3]:
# read_comment is a recursive function to read all comments (and answers)
def read_comment(comment, output, user_to_passed, post_type_passed):
    # List of input vars
    #   comment: dict type. the post (original post or its comment, including comment of comment) to be read.
    #   output: dataframe type. output is a dataframe that treat each post (and comment) as a row.
    #   user_to_passed: dict type, {'name', 'role'}.
    #       to pass user information from upper level post (comment) to next level.
    #   post_type_passed: str type. similar to user_to_passed, but to pass post type.

    # read post author of the current level post
    author = comment['user']['name']
    author_role = comment['user']['role']

    # create user info dict to pass to next level
    user_to = {'name': None, 'role': None}

    # Inherit post information from the record (row) above.
    # Because comments usually do not have 'type', 'title', 'category', 'subcategory', or 'private'
    for elem in df_col_names[0:-6] :
        if elem not in comment :
            #print("elem not in list")
            #print(len(output))
            comment[elem] = output.iloc[-1][elem]

    # If comments exist, author info will be passed to next level posts as user_to.
    if comment['comments'] : # comments is not empty
        user_to['name'] = author
        user_to['role'] = author_role
    else : # comments is empty
        user_to['name'] = None
        user_to['role'] = None

    # Answers will be marked as answer type.
    if post_type_passed == 'answer' :
        comment['type'] = post_type_passed

    # Add new record to existing output dataframe.
    # Each new record (row) contains all information related to the post.
    output = output.append({'url': comment['url'],
                    'type': comment['type'],
                    'title': comment['title'],
                    'category': comment['category'],
                    'subcategory': comment['subcategory'],
                    'votes': comment['votes'],
                    'private': comment['private'],
                    'created_at': comment['created_at'],
                    'text': comment['text'],
                    'user_from': author,
                    'user_from_role': author_role,
                    'user_to': user_to_passed['name'],
                    'user_to_role': user_to_passed['role']}, ignore_index=True)

    # Go over all comments/answers
    replies = comment['comments']
    for reply in replies:
        replier = reply['user']['name']
        replier_role = reply['user']['role']
        output.iloc[-1]['user_from'] = replier
        output.iloc[-1]['user_from_role'] = replier_role
        
        # recursive part
        output = read_comment(reply, output, user_to, None)

    if 'answers' in comment :
        answers = comment['answers']
        user_to['name'] = author
        user_to['role'] = author_role
        for answer in answers:
            replier = answer['user']['name']
            replier_role = answer['user']['role']
            output.iloc[-1]['user_from'] = replier
            output.iloc[-1]['user_from_role'] = replier_role
            output = read_comment(answer, output, user_to, 'answer')

    return output

# Create a dataframe (df_SNA) to store data for every post, comment, and answer.
df_col_names = [str(element) for element in list(discussions[0].keys())]
df_col_names += ['user_from']
df_col_names += ['user_from_role']
df_col_names += ['user_to']
df_col_names += ['user_to_role']
df_col_names.remove('user')
df_col_names.remove('document')
df_col_names.remove('comments')

result = pd.DataFrame([], columns = df_col_names)

for discussion in discussions:
    user = {'name': None, 'role': None}
    result = read_comment(discussion, result, user, None)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output.iloc[-1]['user_from'] = replier
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output.iloc[-1]['user_from_role'] = replier_role
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output.iloc[-1]['user_from'] = replier
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output.iloc[-1]['user_from_role'] = repli

In [4]:
# Create thread_id based on url.
result['thread_id'] = result['url']
result['thread_id'] = result['url'].str.split("?").str.get(0).str.split("courses/").str.get(1)

# Save to csv file.
result.to_csv("./flatten_json.csv", sep=',', index=False)