# ChatGPT History Export Parser

Simple parser code to convert your ChatGPT Conversation History into a CSV for personal analysis.

✔ Export your ChatGPT history and data following OpenAI's documentation [here](https://help.openai.com/en/articles/7260999-how-do-i-export-my-chatgpt-history-and-data).

-----

## Dependencies

In [1]:
%pip install tqdm numpy matplotlib pandas ipywidgets jupyter

Note: you may need to restart the kernel to use updated packages.


In [2]:
!jupyter nbextension enable --py widgetsnbextension

usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: console dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook qtconsole run server
troubleshoot trust

Jupyter command `jupyter-nbextension` not found.


In [3]:
# import argparse
import json
import os
import re
from collections import defaultdict
from typing import Any

from tqdm.notebook import tqdm

from datetime import datetime
# from datetime import date, datetime as dt, timedelta as td
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# check python version
# import sys
# sys.version_info

-----

## Basic Configurations inc File Paths for Conversations and Output

In [5]:
json_filepath = 'data/conversations.json'

In [6]:
out_folder = 'output'

-----

## Simple Data Check and Data Analysis

In [7]:
with open(json_filepath, "r") as file:
    conversations = json.load(file)

In [8]:
# Uncomment to view single raw convo example
# conversations[14]

In [9]:
title_occurrences: defaultdict[str, int] = defaultdict(int)
total_conversations: int = len(conversations)

In [10]:
print("Total Conversations =", total_conversations)

Total Conversations = 1670


----

## Parser and Export to CSV

In [11]:
# Config
# user_name = "Me" # user_name = "Mark Koester"
user_name = "User"
assistant_name = "ChatGPT"
date_format = "%m-%d-%Y"
file_name_format = "{title}"
include_date = True
message_separator = "\n\n"
skip_empty_messages = True

In [12]:
# type(conversations)
# conversations = conversations[5:6]
len(conversations)

1670

In [13]:
def process_conversations_to_df(data):
    convo_msgs = []
    for conversation in tqdm(data, desc="Processing conversations"):
        title = conversation["title"]
        mapping = conversation["mapping"]

        # Extract messages from the "mapping" key
        messages = [mapping[key]["message"] for key in mapping if mapping[key]["message"] is not None]

        # Sort messages by their create_time
        messages.sort(key=lambda x: x["create_time"] if x["create_time"] is not None else float('-inf'))

        if messages and messages[0]["create_time"] is not None:
            conversation_start = datetime.fromtimestamp(messages[0]["create_time"]).strftime(date_format)
        # conversation_start = datetime.fromtimestamp(messages[0]["create_time"]).strftime(date_format)

        msg_list = []
        for message in messages:
            try:
                author_role = message["author"]["role"]
                msg_content = message["content"]["parts"][0]
                msg_date = ''
                if message["create_time"] is not None:
                    msg_date = datetime.fromtimestamp(message["create_time"]).strftime('%Y-%m-%d %H:%M:%S')
                    # msg_date = datetime.fromtimestamp(message["create_time"])
                author_name = user_name if author_role == "user" else assistant_name

                # print(title + " on " + msg_date + " - " +author_name + " " +  msg_content)
                
                msg = {
                    'conversation_title': title,
                    'author': author_name,
                    # 'conversation_date': conversation_start,
                    'message': msg_content,
                    'msg_date': msg_date
                }
                msg_list.append(msg)
            except Exception as e:
                print("Error: ", e)
                print("Message: ", message)
                # write a file per exception
                with open(f'output/error_{title}.json', 'w') as f:
                    json.dump(message, f)
                
        convo_msgs.extend(msg_list)
        
    #convo_msgs.append(msg_list)
    convo_msgs_df = pd.DataFrame(convo_msgs)
    return convo_msgs_df

In [14]:
# process_conversations_to_df(conversations)

In [15]:
convo_msgs = process_conversations_to_df(conversations)

Processing conversations:   0%|          | 0/1670 [00:00<?, ?it/s]

Error:  'parts'
Message:  {'id': '343c779e-654b-4293-85c3-c990b30c4115', 'author': {'role': 'assistant', 'name': None, 'metadata': {}}, 'create_time': 1712991236.609526, 'update_time': None, 'content': {'content_type': 'code', 'language': 'unknown', 'text': 'search("open source software for automatic timeline creation")'}, 'status': 'finished_successfully', 'end_turn': False, 'weight': 1.0, 'metadata': {'finish_details': {'type': 'stop', 'stop_tokens': [100265]}, 'gizmo_id': None, 'is_complete': True, 'message_type': None, 'model_slug': 'gpt-4', 'default_model_slug': 'gpt-4', 'parent_id': 'aaa2d745-29d3-4518-afcc-05b891ba84ca', 'request_id': '87398a9ce9a030c7-ICN', 'timestamp_': 'absolute'}, 'recipient': 'browser'}
Error:  'parts'
Message:  {'id': '167c8b4b-0679-4c2a-b006-50bb4c2d9649', 'author': {'role': 'tool', 'name': 'browser', 'metadata': {}}, 'create_time': 1712991236.612271, 'update_time': None, 'content': {'content_type': 'tether_browsing_display', 'result': '# 【0†The Best Time

In [16]:
# convo_msgs.head()

In [17]:
len(convo_msgs)

14911

In [18]:
convo_msgs.to_csv("data/chatgpt_messages.csv", index=None, encoding='utf-8')

----

## Exporter to Markdown Files per Conversation

In [19]:
# Pre-compiled pattern for disallowed characters in file names
DISALLOWED_CHARS_PATTERN = re.compile(r'[<>:"/\\|?*\n\r\t\f\v]')

In [20]:
def process_conversations_to_markdown(data, output_dir):
    for conversation in tqdm(data, desc="Processing conversations"):
        title = conversation["title"]
        mapping = conversation["mapping"]

        # Extract messages from the "mapping" key
        messages = [mapping[key]["message"] for key in mapping if mapping[key]["message"] is not None]

        # Sort messages by their create_time
        messages.sort(key=lambda x: x["create_time"] if x["create_time"] is not None else float('-inf'))

        if messages and messages[1]["create_time"] is not None:
            conversation_start = datetime.fromtimestamp(messages[1]["create_time"]).strftime("%Y%m%d%H%M")
            # sanitize title to ensure it's a valid filenames
            title = ''.join(c for c in title if c.isalnum() or c in [' ', '_']).rstrip()
            title = conversation_start + "_" + title
            file_name = f"{file_name_format.format(title=title.replace(' ', '_').replace('/', '_'))}.md"
            file_path = os.path.join(output_dir, file_name)

            with open(file_path, "w", encoding="utf-8") as f:
                if messages and messages[1]["create_time"] is not None and include_date:
                    date = datetime.fromtimestamp(messages[1]["create_time"]).strftime(date_format)
                    f.write(f"<sub>{date}</sub>{message_separator}")
    
                for message in messages:
                    try:
                        author_role = message["author"]["role"]
                        msg_content = message["content"]["parts"][0]
                        msg_date = ''
                        if message["create_time"] is not None:
                            msg_date = datetime.fromtimestamp(message["create_time"]).strftime('%Y-%m-%d %H:%M:%S')
                        # msg_date = datetime.fromtimestamp(message["create_time"])
                        author_name = user_name if author_role == "user" else assistant_name
                        
                        # print(title + " on " + msg_date + " - " +author_name + " " +  msg_content)
                        f.write(f"_{author_name} on {msg_date}_: {msg_content}{message_separator}")
                    except Exception as e:
                        print("Error: ", e)
                        print("Message: ", message)
                        # write a file per exception
                        with open(f'output/error_{title}.json', 'w') as f:
                            json.dump(message, f)

In [21]:
output_dir = "ai_conversation_notes"

In [23]:
# uncomment to render as markdown files with date appended at beginning of file
process_conversations_to_markdown(conversations, output_dir)

Processing conversations:   0%|          | 0/1670 [00:00<?, ?it/s]

Error:  'parts'
Message:  {'id': '343c779e-654b-4293-85c3-c990b30c4115', 'author': {'role': 'assistant', 'name': None, 'metadata': {}}, 'create_time': 1712991236.609526, 'update_time': None, 'content': {'content_type': 'code', 'language': 'unknown', 'text': 'search("open source software for automatic timeline creation")'}, 'status': 'finished_successfully', 'end_turn': False, 'weight': 1.0, 'metadata': {'finish_details': {'type': 'stop', 'stop_tokens': [100265]}, 'gizmo_id': None, 'is_complete': True, 'message_type': None, 'model_slug': 'gpt-4', 'default_model_slug': 'gpt-4', 'parent_id': 'aaa2d745-29d3-4518-afcc-05b891ba84ca', 'request_id': '87398a9ce9a030c7-ICN', 'timestamp_': 'absolute'}, 'recipient': 'browser'}
Error:  'parts'
Message:  {'id': '167c8b4b-0679-4c2a-b006-50bb4c2d9649', 'author': {'role': 'tool', 'name': 'browser', 'metadata': {}}, 'create_time': 1712991236.612271, 'update_time': None, 'content': {'content_type': 'tether_browsing_display', 'result': '# 【0†The Best Time