In [3]:
import json
import pandas as pd

def _iter_tweet_nodes(node):
    """Yield each dict nested under ``node`` that holds a scalar ``full_text``.

    Dicts and lists are walked depth-first in document order. A ``full_text``
    key whose value is itself a dict or list is not treated as a tweet body;
    the walk simply descends into that value instead.
    """
    if isinstance(node, dict):
        for key, value in node.items():
            if isinstance(value, (dict, list)):
                yield from _iter_tweet_nodes(value)
            elif key == 'full_text':
                yield node
    elif isinstance(node, list):
        for item in node:
            yield from _iter_tweet_nodes(item)


def extract_tweets_from_har(file_path):
    """Extract tweet bodies and timestamps from the responses stored in a HAR file.

    Every entry under ``log.entries`` whose ``response.content.text`` is a
    string is parsed as JSON. For each nested dict that carries a scalar
    ``full_text`` value, one record is emitted using the ``created_at`` value
    of that *same* dict. Reading both fields from one object keeps a tweet's
    timestamp attached to its own text, even when the response contains other
    ``created_at`` fields in objects that have no ``full_text``.

    Args:
        file_path: Path to a UTF-8 encoded HAR (JSON) file.

    Returns:
        A list of ``{"tweet_body": ..., "tweet_time": ...}`` dicts in document
        order. ``tweet_time`` is ``None`` when the object holding the text has
        no scalar ``created_at`` value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the HAR file itself is not valid JSON.
        KeyError: If the file lacks the ``log`` / ``entries`` structure.
    """
    # Load HAR file
    with open(file_path, 'r', encoding='utf-8') as f:
        har_data = json.load(f)

    tweets = []
    # Iterate through HAR entries to extract tweets
    for entry_num, entry in enumerate(har_data['log']['entries']):
        content = (entry.get('response') or {}).get('content') or {}
        response_text = content.get('text')
        # Entries without a textual body (absent or null "text") carry nothing
        # to parse; skip them instead of letting json.loads raise TypeError.
        if not isinstance(response_text, str):
            continue
        try:
            response_json = json.loads(response_text)
        except json.JSONDecodeError:
            print(f"Failed to process entry {entry_num} - possible structure discrepancy.")
            continue
        for node in _iter_tweet_nodes(response_json):
            tweet_time = node.get('created_at')
            if isinstance(tweet_time, (dict, list)):
                # Only scalar timestamps are meaningful here.
                tweet_time = None
            tweets.append({
                "tweet_body": node['full_text'],
                "tweet_time": tweet_time
            })
    return tweets

def json_extract(obj, key):
    """Collect every scalar value stored under ``key`` in a nested JSON value.

    Dicts and lists are traversed depth-first in document order. A value that
    is itself a dict or list is never collected, even when its key matches;
    the traversal descends into it instead. Any other kind of ``obj`` yields
    an empty result.

    Args:
        obj: A decoded JSON value (dict, list, or scalar).
        key: The dictionary key to look for.

    Returns:
        A new list with the matching values in the order they were found.
    """
    def walk(node):
        if isinstance(node, list):
            for element in node:
                yield from walk(element)
        elif isinstance(node, dict):
            for name, value in node.items():
                if isinstance(value, (dict, list)):
                    # Containers are searched, not collected.
                    yield from walk(value)
                elif name == key:
                    yield value

    return list(walk(obj))

def tweets_to_dataframe(tweets):
    """Convert extracted tweet records into a DataFrame with a ``has_train`` flag.

    Args:
        tweets: A list of dicts, each with at least a ``tweet_body`` key (the
            records built by ``extract_tweets_from_har`` also carry
            ``tweet_time``).

    Returns:
        A DataFrame with one row per record plus an integer ``has_train``
        column that is 1 when ``tweet_body`` contains the literal,
        case-sensitive substring "Train" and 0 otherwise. Missing or
        non-string bodies are flagged 0. An empty input yields an empty frame
        with ``tweet_body``, ``tweet_time`` and ``has_train`` columns.

    Raises:
        KeyError: If the records are non-empty but have no ``tweet_body`` key.
    """
    # Convert tweet details to Pandas DataFrame
    df = pd.DataFrame(tweets)

    # With no records there are no columns either; create the expected ones so
    # the lookup below (and downstream code) works on an empty extraction.
    if df.empty and 'tweet_body' not in df.columns:
        df = pd.DataFrame(columns=['tweet_body', 'tweet_time'])

    # Add a binary flag for when tweet_body contains "Train".
    # regex=False keeps this a plain substring test; na=False maps missing or
    # non-string bodies to 0 instead of raising.
    df['has_train'] = (
        df['tweet_body']
        .str.contains("Train", regex=False, na=False)
        .astype('int64')
    )

    return df

# Step 1: Extract tweets from HAR file
# NOTE(review): both paths below are absolute and specific to one machine;
# the cell will not run elsewhere without editing them.
# Alternative input file, currently disabled:
#har_file_path = "/Users/troys/OneDrive/Documents/Python/BeeEye/TRAINS - X HAR FILE/twitter.com.har"
har_file_path = "/Users/troys/OneDrive/Documents/Python/BeeEye/TRAINS - X HAR FILE/trains_Aug24_Oct22.har"
# `tweets` is a list of {"tweet_body", "tweet_time"} dicts.
tweets = extract_tweets_from_har(har_file_path)

# Step 2: Convert to DataFrame
# `df` is also read by the CSV-export cell further down.
df = tweets_to_dataframe(tweets)

# Print the whole frame as plain text to the cell output.
print(df)

Failed to process entry 0 - possible structure discrepancy.
Failed to process entry 1 - possible structure discrepancy.
Failed to process entry 2 - possible structure discrepancy.
Failed to process entry 3 - possible structure discrepancy.
Failed to process entry 4 - possible structure discrepancy.
Failed to process entry 5 - possible structure discrepancy.
Failed to process entry 7 - possible structure discrepancy.
Failed to process entry 8 - possible structure discrepancy.
Failed to process entry 9 - possible structure discrepancy.
Failed to process entry 10 - possible structure discrepancy.
Failed to process entry 11 - possible structure discrepancy.
Failed to process entry 13 - possible structure discrepancy.
Failed to process entry 14 - possible structure discrepancy.
Failed to process entry 15 - possible structure discrepancy.
Failed to process entry 16 - possible structure discrepancy.
Failed to process entry 17 - possible structure discrepancy.
Failed to process entry 18 - poss

In [None]:
# Write the DataFrame built in the extraction cell to a CSV file. The path is
# relative, so the file is created in the current working directory.
# NOTE(review): to_csv is called with its defaults, so presumably the row index
# is written as an extra unnamed first column - confirm that is intended.
df.to_csv('trains_Aug24_Oct22.csv')