In [1]:
import pandas as pd

import json

# Load the raw tweet records. The encoding is passed explicitly because open()
# otherwise falls back to the platform's locale encoding, which can fail on (or
# garble) non-ASCII tweet text on systems whose default is not UTF-8.
# NOTE(review): assumes the file is UTF-8, the standard JSON encoding — confirm.
with open('gg2013.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert the JSON data to a pandas DataFrame
df = pd.DataFrame(data)

# Pull the two fields needed from each nested 'user' entry into flat columns
df['user_screen_name'] = df['user'].apply(lambda x: x['screen_name'])
df['user_id'] = df['user'].apply(lambda x: x['id'])

# The nested 'user' column is redundant once those fields are extracted
df = df.drop('user', axis=1)

# Interpret timestamp_ms as milliseconds since the Unix epoch
df['timestamp'] = pd.to_datetime(df['timestamp_ms'], unit='ms')

# Drop the original timestamp_ms column now that 'timestamp' replaces it
df = df.drop('timestamp_ms', axis=1)

# Keep only these columns, ordered for readability
df = df[['id', 'timestamp', 'user_id', 'user_screen_name', 'text']]

# Display the first few rows of the DataFrame
print(df.head())

                   id           timestamp    user_id user_screen_name  \
0  290620657987887104 2013-01-14 00:45:38  557374298        Dozaaa_xo   
1  290620657887219713 2013-01-14 00:45:38   14648726     theAmberShow   
2  290620657828524032 2013-01-14 00:45:38   35498686         SweetyPW   
3  290620657799159809 2013-01-14 00:45:38  144430208   _NicoleEdwards   
4  290620657778188288 2013-01-14 00:45:38  134953223    lolaogunnaike   

                                                text  
0             JLo's dress! #eredcarpet #GoldenGlobes  
1  What's making Sofia Vergara's boobs stay like ...  
2  RT @FabSugar: Kerry Washington is EVERYTHING. ...  
3     Anne Hathaway has got me living. #GoldenGlobes  
4  Jennifer Lopez's lace dress? Thoughts? #Golden...  


In [8]:
import re

# Function to apply regex patterns and extract potential winners
def extract_winners(text, award):
    """Extract candidate winner names that precede a "winning" verb in a tweet.

    For each of the verbs ``wins``, ``won``, ``awarded``, ``receives`` and
    ``received`` (matched case-insensitively), the one or two words directly
    before the verb are captured. An optional ``just``, ``she just`` or
    ``he just`` may sit between the captured words and the verb. The verb must
    be followed by whitespace that is not itself directly followed by
    ``award`` (negative lookahead).

    Args:
        text: Tweet text to scan. Non-string values (e.g. ``None`` or NaN from
            a DataFrame column) produce an empty list instead of an error.
        award: Award name used in the lookahead. It is matched literally, so
            names containing regex metacharacters such as ``(`` or ``-`` are
            safe to pass.

    Returns:
        A list of captured strings, grouped by verb in the order listed above
        (not by position in ``text``). Empty when nothing matches.

    NOTE(review): the name group is greedy, so it can absorb the word before
    the verb, e.g. "Adele just won big" yields "Adele just". Left as-is to
    keep existing results stable — confirm whether that is intended.
    """
    if not isinstance(text, str):
        return []

    just_variations = r'(?:(?:(?:she|he)\s+)?just\s+)?'
    # Escape the award so characters like "(" cannot add capturing groups
    # (which would make re.findall return tuples) or alter the lookahead.
    not_award = r'(?!' + re.escape(award) + r')'
    verbs = ('wins', 'won', 'awarded', 'receives', 'received')

    winners = []
    for verb in verbs:
        pattern = r'(\w+(?:\s+\w+)?)\s+' + just_variations + verb + r'\s+' + not_award
        winners.extend(re.findall(pattern, text, re.IGNORECASE))
    return winners

# Tag every tweet with the list of candidate winners found in its text
df['potential_winners'] = df['text'].apply(
    lambda tweet: extract_winners(tweet, "Best Picture")
)

# Tally how many times each candidate string was extracted across all tweets
all_winners = df['potential_winners'].dropna()
winner_counts = {}
for winners in all_winners:
    # An empty list simply contributes nothing to the tally
    for winner in winners:
        winner_counts[winner] = winner_counts.get(winner, 0) + 1

# Assemble the report, listing candidates from most to least frequent.
# Nominees and presenters stay empty because nothing here derives them.
output = {
    "Award": "Best Picture",
    "Nominees": [],
    "Presenters": [],
    "Winner": sorted(
        (
            {"Name": name, "Number of Tweets": tally}
            for name, tally in winner_counts.items()
        ),
        key=lambda entry: entry["Number of Tweets"],
        reverse=True,
    ),
}

print(json.dumps(output, indent=4))

{
    "Award": "Best Picture",
    "Nominees": [],
    "Presenters": [],
    "Winner": [
        {
            "Name": "she",
            "Number of Tweets": 319
        },
        {
            "Name": "Adele",
            "Number of Tweets": 314
        },
        {
            "Name": "Affleck",
            "Number of Tweets": 302
        },
        {
            "Name": "Hathaway",
            "Number of Tweets": 279
        },
        {
            "Name": "Jackman",
            "Number of Tweets": 240
        },
        {
            "Name": "Lawrence",
            "Number of Tweets": 234
        },
        {
            "Name": "Lewis",
            "Number of Tweets": 225
        },
        {
            "Name": "Waltz",
            "Number of Tweets": 169
        },
        {
            "Name": "Tarantino",
            "Number of Tweets": 145
        },
        {
            "Name": "Argo",
            "Number of Tweets": 134
        },
        {
            "Name": "Dunham",


Planned output:
{
    "Award": "Best Picture",
    "Nominees": ["a", "b", "c", "d", "e"],
    "Presenters": ["d", "e", "f"],
    "Winner": [
        {
            "Name": "John",
            "Number of Tweets": 5
        },
        {
            "Name": "Jane",
            "Number of Tweets": 3
        },
        ...
    ]
}