# Purpose and use

This Jupyter Notebook allows for the time series and general analysis of the scraped Wikipedia revisions.

# Imports

In [60]:
%load_ext autoreload
%autoreload 2

scripts_dir = Path().resolve().parent
sys.path.append(str(scripts_dir))
from config import *

import pyarrow.feather as feather

## importing common data
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import gc

import numpy as np
import pandas as pd
from PIL import Image

# plot
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

# annotations
from highlight_text import fig_text, ax_text
from matplotlib.patches import FancyArrowPatch

import re

#import go
from plotly import graph_objects as go
import mwparserfromhell
from datetime import datetime, timedelta
import pytz


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
#clean up memory
gc.collect()

0

In [3]:
#writing to reqs
!pip3 freeze > requirements.txt

# Read in Data

To facilitate reading in the large amount of data, we'll write a helper function to read the feather dfs in chunks.

In [4]:
#declare file paths
file_path_Taylor = "/Volumes/PSSD T7/Taylor_Swift.feather"
file_path_Kanye = "/Volumes/PSSD T7/Kanye_West.feather"

In [5]:
#write helper function
def read_feather_in_chunks(file_path, chunk_size):
    table = feather.read_table(file_path)
    df = table.to_pandas()
    total_rows = len(df)
    for start in range(0, total_rows, chunk_size):
        yield df[start:start + chunk_size]

In [6]:
#reading in data: Taylor Swift
chunk_size = 2000
chunks = []

for chunk in read_feather_in_chunks(file_path_Taylor, chunk_size):
    chunks.append(chunk)

ts_df = pd.concat(chunks, ignore_index=True)

In [7]:
#clean up memory
gc.collect()

0

In [8]:
#reading in data: Kanye West
chunk_size = 2000
chunks = []

for chunk in read_feather_in_chunks(file_path_Kanye, chunk_size):
    chunks.append(chunk)

ky_df = pd.concat(chunks, ignore_index=True)

In [9]:
#check
ts_df.head()

Unnamed: 0,revision_id,timestamp,username,userid,comment,text_length,text,year,month,day
0,1253532621,2024-10-26 14:03:32+00:00,Jessintime,44776426,Undid revision [[Special:Diff/1253531462|12535...,359306,{{Short description|American singer-songwriter...,Taylor_Swift,2024,10
1,1253531462,2024-10-26 13:55:46+00:00,Chuckles987,48629687,Updated short description,359295,{{Short description|American singer (born 1989...,Taylor_Swift,2024,10
2,1253458894,2024-10-26 02:49:42+00:00,NegativeMP1,45135659,Restored revision 1253426413 by [[Special:Cont...,359306,{{Short description|American singer-songwriter...,Taylor_Swift,2024,10
3,1253458276,2024-10-26 02:44:43+00:00,Hammsster,47189514,,359462,{{Short description|American singer-songwriter...,Taylor_Swift,2024,10
4,1253457996,2024-10-26 02:42:36+00:00,Hammsster,47189514,ancestry,359577,{{Short description|American singer-songwriter...,Taylor_Swift,2024,10


In [10]:
#check
ky_df.head()

Unnamed: 0,revision_id,timestamp,username,userid,comment,text_length,text,year,month,day
0,1253370019,2024-10-25 16:44:50+00:00,BigChungusOnVinyl,35552696,/* Discography */,291490,{{Short description|American rapper and produc...,Kanye_West,2024,10
1,1253369815,2024-10-25 16:43:20+00:00,ULPS,44938552,/* 2023–present: Vultures trilogy and Bully */...,291472,{{Short description|American rapper and produc...,Kanye_West,2024,10
2,1253368276,2024-10-25 16:33:36+00:00,BigChungusOnVinyl,35552696,/* Discography */,291440,{{Short description|American rapper and produc...,Kanye_West,2024,10
3,1253106242,2024-10-24 11:18:59+00:00,Thedogishere931,48083651,/* Discography */,291470,{{Short description|American rapper and produc...,Kanye_West,2024,10
4,1252589484,2024-10-22 01:11:08+00:00,Speakfor23,46861074,/* Other relationships */,291440,{{Short description|American rapper and produc...,Kanye_West,2024,10


# Plot Timeseries

In [85]:
ky_df_D = ky_df

In [86]:
ky_df_D['timestamp'] = pd.to_datetime(ky_df_D['timestamp'])
ky_df_D.set_index('timestamp', inplace=True)

In [15]:
ts_df_D = ts_df
ts_df_D['timestamp'] = pd.to_datetime(ts_df_D['timestamp'])
ts_df_D.set_index('timestamp', inplace=True)

In [16]:
ts_df_D = ts_df_D.resample('M').count()

  ts_df_D = ts_df_D.resample('M').count()  # Daily count of revisions


In [87]:
ky_df_D = ky_df_D.resample('M').count()


'M' is deprecated and will be removed in a future version, please use 'ME' instead.



In [18]:
fig = px.line(ky_df_D, x=ky_df_D.index, y='revision_id', title='Wikipedia Page Revisions Over Time -  Kanye West')
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Number of Revisions')

fig.show()
fig.write_html("/Volumes/PSSD T7/KW_revisions.html")


In [19]:
fig = px.line(ts_df_D, x=ts_df_D.index, y='revision_id', title='Wikipedia Page Revisions Over Time -  Kanye West')
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Number of Revisions')

fig.show()
fig.write_html("/Volumes/PSSD T7/TS_revisions.html")


In [88]:
def preprocess(content):
    #remove unncess characters
    to_remove = ['url','https','org','cite','Cite','status','archive','web','title','access','date','ref','January','February','March','April','May','June','July','August','September','October','November','December']
    content = re.sub(r'\\[a-z]+\d*', '', content)
    content = re.sub(r'{|}', '', content)
    content = re.sub(r'\\\'..', '', content)
    content = re.sub(r'\s+', ' ', content)
    start = content.find("Taylor Alison Swift")
    content = content[start:] if start != -1 else content

    for word in to_remove:
        content = content.replace(word, '')

    return content

In [89]:
#prep for wordcloud TS
text = ts_df.iloc[0]['text']
content = preprocess(text)

In [83]:
#create wordcloud

wordcloud = WordCloud(
    width=1500, height=600,
    background_color='white',
    colormap="copper",
    contour_width=1,
    contour_color='brown',
    random_state=2
).generate(content)
#save & load
wordcloud.to_file("wordcloud_image.png")
img = Image.open("wordcloud_image.png")
img_array = np.array(img)

#create fig
fig = go.Figure()
fig.add_layout_image(
    dict(
        source=img,
        xref="paper", yref="paper",
        x=0, y=1,
        sizex=1, sizey=1,
        xanchor="left", yanchor="top",
        opacity=0.13,
        layer="below"
    )
)
fig.add_trace(
    go.Scatter(
        x=ts_df_D.index, y=ts_df_D['revision_id'],
        mode='lines', line=dict(color='#8c564b'),
        name="Number of Edits"
    )
)
fig.add_trace(
    go.Scatter(
        x=ts_df_D.index, y=ts_df_D['revision_id'],
        mode='markers', marker=dict(
            color=ts_df_D['revision_id'],
            colorscale='rdbu',
            showscale=False, line=dict(width=0.5, color='black')),
        name="Data Points"
    )
)
fig.update_layout(
    title_text=f"Editing History of <b>Taylor Swift</b>'s Wikipedia Page",
    title_x=0.5,
    xaxis=dict(title="Date"),
    yaxis=dict(title="Number of Wikipedia Page Edits"),
    title_font=dict(size=35),
    annotations=[
        dict(
            x="2012-05", y=662, 
            text="May 2012: Billboard Music Awards", showarrow=True,
            arrowhead=2, arrowsize = 1.5,
            font=dict(size=15, color="#5D4037"),
            bgcolor = "#CCA677"
        ),
        dict(
            x="2009-09", y=211, 
            text="Sep 2009: VMAs", showarrow=True,
            arrowhead=2, arrowsize = 1.5,
            font=dict(size=15, color="#5D4037"),
            bgcolor = "#CCA677"
        )
    ],
    font_family = "Raleway",
    font_color = "#5D4037",

        width=1200,
    height=600
)

fig.update_layout(showlegend=False)
fig.show()

In [90]:
#kanye
text = ky_df.iloc[0]['text']
content = preprocess(text)

In [94]:
#create wordcloud
wordcloud = WordCloud(
    width=1500, height=600,
    background_color='white',
    colormap="copper",
    contour_width=1,
    contour_color='brown',
    random_state=2
).generate(content)

#save & load
wordcloud.to_file("wordcloud_image.png")
img = Image.open("wordcloud_image.png")
img_array = np.array(img)

#create fig
fig = go.Figure()
fig.add_layout_image(
    dict(
        source=img,
        xref="paper", yref="paper",
        x=0, y=1,
        sizex=1, sizey=1,
        xanchor="left", yanchor="top",
        opacity=0.13,
        layer="below"
    )
)

fig.add_trace(
    go.Scatter(
        x=ky_df_D.index, y=ky_df_D['revision_id'],
        mode='lines', line=dict(color='#8c564b'),
        name="Number of Edits"
    )
)

fig.add_trace(
    go.Scatter(
        x=ky_df_D.index, y=ky_df_D['revision_id'],
        mode='markers', marker=dict(
            color=ky_df_D['revision_id'],
            colorscale='rdbu',
            showscale=False, line=dict(width=0.5, color='black')),
        name="Data Points"
    )
)

fig.update_layout(
    title_text=f"Editing History of <b>Kanye West</b>'s Wikipedia Page",
    title_x=0.5,
    xaxis=dict(title="Date"),
    yaxis=dict(title="Number of Wikipedia Page Edits"),
    title_font=dict(size=35),
    annotations=[
        dict(
            x="2022-12", y=540, 
            text="Dec 2022: Antisemitic Remarks", showarrow=True,
            arrowhead=2, arrowsize = 1.5,
            font=dict(size=15, color="#5D4037"),
            bgcolor = "#CCA677"
        ),
        dict(
            x="2009-09", y=166, 
            text="Sep 2009: VMAs", showarrow=True,
            arrowhead=2, arrowsize = 1.5,
            font=dict(size=15, color="#5D4037"),
            bgcolor = "#CCA677"
        )
    ],
    font_family = "Raleway",
    font_color = "#5D4037",
        width=1200,
    height=600
)

fig.update_layout(showlegend=False)
fig.show()

# Editors analysis

In [38]:
#reset index
ky_df = ky_df.reset_index()
ts_df = ts_df.reset_index()

In [34]:
#value counts on userid
ky_df['userid'].value_counts()

userid
16899771    492
19493411    242
4293477     240
6661051     199
19507760    107
           ... 
14639338      1
33086960      1
24373534      1
33087499      1
1803961       1
Name: count, Length: 2886, dtype: int64

In [35]:
#value counts on userid
ts_df['userid'].value_counts()

userid
6138283     2375
27173459    1935
19895717    1219
35372709     391
19269270     324
            ... 
19042454       1
31893297       1
742316         1
11673237       1
15904812       1
Name: count, Length: 3015, dtype: int64

In [36]:
#comparing editors
ky_editors = ky_df['userid'].unique()
ts_editors = ts_df['userid'].unique()
common_editors = set(ky_editors).intersection(ts_editors)
print(common_editors)

{'82432', '39358722', '1495172', '28331428', '14863013', '12783610', '1574439', '36389', '7713040', '14508071', '6303058', '10199389', '7723863', '1879566', '33456653', '27173459', '40327909', '12765036', '13272108', '20588153', '33609231', '35667443', '1152308', '18562586', '12847129', '4088067', '1215485', '7098284', '35999521', '736651', '12374079', '4904587', '7665210', '36823859', '28539235', '7903804', '19986047', '623801', '13006032', '929196', '13350630', '1286970', '18915484', '24198', '40943757', '1341640', '2968397', '5627478', '40385031', '372290', '4885781', '42233556', '17906789', '16283967', '7662999', '20438250', '1755837', '28761571', '43881889', '19514078', '8168717', '8729451', None, '38455', '13930115', '4207917', '409043', '26073722', '7320905', '30699314', '4788526', '29919047', '33747395', '44120587', '43179530', '101140', '9107368', '8213825', '7667333', '1826119', '764407', '18173407', '45645', '23530426', '25571107', '25082147', '118722', '3479822', '196446', 

# Analysis VMA

In [39]:
VMA_date = datetime.strptime("2009-09-13", "%Y-%m-%d").replace(tzinfo=pytz.UTC)

In [40]:
date_one_week_before = VMA_date - timedelta(days=7)
date_one_week_after = VMA_date + timedelta(days=7)

In [41]:
ts_df['timestamp'] = pd.to_datetime(ts_df['timestamp'])
ky_df['timestamp'] = pd.to_datetime(ky_df['timestamp'])

In [42]:
ts_df_VMA = ts_df[(ts_df['timestamp'] > date_one_week_before) & (ts_df['timestamp'] < date_one_week_after)]
ky_df_VMA = ky_df[(ky_df['timestamp'] > date_one_week_before) & (ky_df['timestamp'] < date_one_week_after)]

In [43]:
ts_df_VMA_after = ts_df[(ts_df['timestamp'] > VMA_date) & (ts_df['timestamp'] < date_one_week_after)]
ky_df_VMA_after = ky_df[(ky_df['timestamp'] > VMA_date) & (ky_df['timestamp'] < date_one_week_after)]

In [44]:
ts_before = ts_df[(ts_df['timestamp'] > date_one_week_before) & (ts_df['timestamp'] < VMA_date)].count()
ts_during = ts_df[(ts_df['timestamp'].dt.date == VMA_date.date())].count()
ts_after = ts_df[(ts_df['timestamp'] > VMA_date) & (ts_df['timestamp'] < date_one_week_after)].count()


In [45]:
ky_before = ky_df[(ky_df['timestamp'] > date_one_week_before) & (ky_df['timestamp'] < VMA_date)].count()
ky_during = ky_df[(ky_df['timestamp'].dt.date == VMA_date.date())].count() 
ky_after = ky_df[(ky_df['timestamp'] > VMA_date) & (ky_df['timestamp'] < date_one_week_after)].count()

In [48]:
ts_df_VMA_after['parsed_text'] = ts_df_VMA_after['text'].apply(lambda x: mwparserfromhell.parse(x).strip_code())
ky_df_VMA_after['parsed_text'] = ky_df_VMA_after['text'].apply(lambda x: mwparserfromhell.parse(x).strip_code())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [49]:
from difflib import ndiff

def find_text_differences(text1, text2):
    diff = list(ndiff(text1.split(), text2.split()))
    added = [word[2:] for word in diff if word.startswith('+ ')]
    removed = [word[2:] for word in diff if word.startswith('- ')]
    unchanged = [word[2:] for word in diff if word.startswith('  ')]
    
    return {
        "added": added,
        "removed": removed
    }

In [None]:
def find_text_differences_2(text1, text2):
    if text1 is None or text2 is None:
        return {"added": [], "removed": []}
    return find_text_differences(text1, text2)

In [50]:
ts_df_VMA_after = ts_df_VMA_after.sort_values(by='timestamp')
ts_df_VMA_after['text_diff'] = ts_df_VMA_after['text'].shift().combine(ts_df_VMA_after['text'], find_text_differences_2)

ts_df_VMA_after.head()

Unnamed: 0,timestamp,revision_id,username,userid,comment,text_length,text,year,month,day,parsed_text,text_diff
16624,2009-09-13 00:33:49+00:00,313473831,Fedisking,1883650,/* Personal life */ Reference to Tyler Dean's ...,51847,{{Otheruses4|the musician|her self-titled debu...,Taylor_Swift,2009,9,"Taylor Alison Swift (born December 13, 1989) i...","{'added': [], 'removed': []}"
16623,2009-09-13 01:23:11+00:00,313481073,Cwateyou,9154284,/* Band */,51881,{{Otheruses4|the musician|her self-titled debu...,Taylor_Swift,2009,9,"Taylor Alison Swift (born December 13, 1989) i...","{'added': ['*', 'Charlotte', 'Ray&nbsp;–', '[[..."
16622,2009-09-13 01:24:21+00:00,313481277,Cwateyou,9154284,/* Band */,51883,{{Otheruses4|the musician|her self-titled debu...,Taylor_Swift,2009,9,"Taylor Alison Swift (born December 13, 1989) i...","{'added': ['Stripper', 'Haynes&nbsp;–'], 'remo..."
16621,2009-09-13 01:24:59+00:00,313481360,Cwateyou,9154284,/* Band */,51847,{{Otheruses4|the musician|her self-titled debu...,Taylor_Swift,2009,9,"Taylor Alison Swift (born December 13, 1989) i...","{'added': [], 'removed': ['*', 'Stripper', 'Ha..."
16620,2009-09-13 01:26:49+00:00,313481608,Cwateyou,9154284,,51865,{{Otheruses4|the musician|her self-titled debu...,Taylor_Swift,2009,9,"Taylor Alison Swift (born December 13, 1989) i...","{'added': ['(band)|Gloriana]],', '[[Selena', '..."


In [51]:
ky_df_VMA_after = ky_df_VMA_after.sort_values(by='timestamp')
ky_df_VMA_after['text_diff'] = ky_df_VMA_after['text'].shift().combine(ky_df_VMA_after['text'], find_text_differences_2)
ky_df_VMA_after.head()

Unnamed: 0,index,timestamp,revision_id,username,userid,comment,text_length,text,year,month,day,parsed_text,text_diff
8214,8214,2009-09-14 01:43:12+00:00,313708718,Ricky3374,10347019,,57681,{{pp-semi|small=yes}}\n{{Infobox Musical artis...,Kanye_West,2009,9,"Kanye Omari West (; born June 8, 1977) is an A...","{'added': [], 'removed': []}"
8213,8213,2009-09-14 01:43:35+00:00,313708801,E1foley,1800521,/* Awards and controversy (2006) */,58154,{{pp-semi|small=yes}}\n{{Infobox Musical artis...,Kanye_West,2009,9,"Kanye Omari West (; born June 8, 1977) is an A...","{'added': ['During', 'the', '2009', '[[MTV', '..."
8212,8212,2009-09-14 01:47:28+00:00,313709562,CharleyHart,5875731,,58496,{{pp-semi|small=yes}}\n{{Infobox Musical artis...,Kanye_West,2009,9,"Kanye Omari West (; born June 8, 1977) is a co...","{'added': ['a', 'controversial,', 'self-concer..."
8211,8211,2009-09-14 01:48:51+00:00,313709840,Silly Demetrius,7341291,,58515,{{pp-semi|small=yes}}\n{{Infobox Musical artis...,Kanye_West,2009,9,"Kanye Omari West (; born June 8, 1977) is a co...","{'added': ['[[singer]],', 'and', 'major', 'ass..."
8210,8210,2009-09-14 01:49:00+00:00,313709865,Jdtink18,4473461,,58661,{{pp-semi|small=yes}}\n{{Infobox Musical artis...,Kanye_West,2009,9,"Kanye Omari West (; born June 8, 1977) is a co...","{'added': ['Kanye', 'is', 'also', 'well-known'..."


In [52]:
# check if in added in column text_diff the word 'controversy' or 'controversial' gets mentioned (count)
print(f"Controversy mentions for KW: {ky_df_VMA_after['text_diff'].apply(lambda x: any(word in ['controversy', 'controversial','Controversy','Controversial'] for word in map(str.lower, x['added']))).sum()}")
print(f"Controversy mentions for TS: {ts_df_VMA_after['text_diff'].apply(lambda x: any(word in ['controversy', 'controversial','Controversy','Controversial'] for word in map(str.lower, x['added']))).sum()}")


Controversy mentions for KW: 13
Controversy mentions for TS: 1


In [53]:
ts_df_VMA_before = ts_df[(ts_df['timestamp'] > date_one_week_before) & (ts_df['timestamp'] < VMA_date)]
ky_df_VMA_before = ky_df[(ky_df['timestamp'] > date_one_week_before) & (ky_df['timestamp'] < VMA_date)]

In [54]:
ts_df_VMA_before = ts_df_VMA_before.sort_values(by='timestamp')
ky_df_VMA_before = ky_df_VMA_before.sort_values(by='timestamp')

In [55]:
ts_last_before = ts_df_VMA_before.iloc[-1]['text']
ky_last_before = ky_df_VMA_before.iloc[-1]['text']

In [56]:
print(f"Controversy mentions for TS BEFORE VMAs: {ts_last_before.lower().count('controversy') + ts_last_before.lower().count('controversial')}")
print(f"Controversy mentions for KY BEFORE VMAs: {ky_last_before.lower().count('controversy') + ky_last_before.lower().count('controversial')}")

Controversy mentions for TS BEFORE VMAs: 0
Controversy mentions for KY BEFORE VMAs: 3


In [57]:
ts_df_VMA_after = ts_df[(ts_df['timestamp'] > VMA_date) & (ts_df['timestamp'] < date_one_week_after)]
ky_df_VMA_after = ky_df[(ky_df['timestamp'] > VMA_date) & (ky_df['timestamp'] < date_one_week_after)]

In [58]:
ts_df_VMA_after['userid'].value_counts(normalize=True).head(10)


userid
4293477     0.364865
2197222     0.081081
10508097    0.060811
10106521    0.054054
9154284     0.033784
6436929     0.020270
5627478     0.020270
4810503     0.020270
1495172     0.013514
9739013     0.013514
Name: proportion, dtype: float64

In [59]:
ky_df_VMA_after['userid'].value_counts(normalize=True).head(10)

userid
4293477    0.190789
2084988    0.085526
911178     0.046053
6436929    0.039474
42788      0.026316
1800521    0.026316
4322169    0.026316
2442447    0.026316
7581206    0.019737
1481857    0.019737
Name: proportion, dtype: float64

## Common Editors

In [None]:
#check for common editors
ky_editors_VMA = ky_df_VMA['userid'].unique()
ts_editors_VMA = ts_df_VMA['userid'].unique()

common_editors_VMA = set(ky_editors_VMA).intersection(ts_editors_VMA)

In [None]:
common_editors_VMA

{'10106521', '1800521', '4293477', '5627478', '6436929', '8675237'}

In [None]:
#most active editor
ky_df['userid'].value_counts()

userid
16899771    492
19493411    242
4293477     240
6661051     199
19507760    107
           ... 
14639338      1
33086960      1
24373534      1
33087499      1
1803961       1
Name: count, Length: 2886, dtype: int64

In [None]:
most_active_user_ts = ts_df['userid'].value_counts().idxmax()
ts_df['revision_id'].value_counts().sum()
#how many edits were made by most active user
ts_df[ts_df['userid'] == most_active_user_ts]['revision_id'].value_counts().sum()

np.int64(2375)

In [None]:
2375/ts_df['revision_id'].value_counts().sum()

np.float64(0.12305061913890472)

In [None]:
ky_df['revision_id'].value_counts().sum()

np.int64(9642)

# Taylor: Billboard

In [61]:
BB = datetime.strptime("2012-05-20", "%Y-%m-%d").replace(tzinfo=pytz.UTC)

In [62]:
date_one_week_before_BB = BB - timedelta(days=7)
date_one_week_after_BB = BB + timedelta(days=7)

In [63]:
ts_df_BB_after = ts_df[(ts_df['timestamp'] > BB) & (ts_df['timestamp'] < date_one_week_after_BB)]

In [64]:
ts_df_BB_after

Unnamed: 0,timestamp,revision_id,username,userid,comment,text_length,text,year,month,day
14139,2012-05-26 23:25:56+00:00,494534513,Popeye191,6138283,/* Natural disaster funds */,149698,{{About|the musician|her self-titled debut alb...,Taylor_Swift,2012,05
14140,2012-05-26 23:21:32+00:00,494533999,Popeye191,6138283,,149476,{{About|the musician|her self-titled debut alb...,Taylor_Swift,2012,05
14141,2012-05-26 23:13:46+00:00,494533134,Popeye191,6138283,/* Arts education */,149476,{{About|the musician|her self-titled debut alb...,Taylor_Swift,2012,05
14142,2012-05-26 23:11:30+00:00,494532875,Popeye191,6138283,/* Arts education */,149472,{{About|the musician|her self-titled debut alb...,Taylor_Swift,2012,05
14143,2012-05-26 23:06:25+00:00,494532257,Popeye191,6138283,/* Arts education */,149457,{{About|the musician|her self-titled debut alb...,Taylor_Swift,2012,05
...,...,...,...,...,...,...,...,...,...,...
14259,2012-05-20 16:31:34+00:00,493520571,Popeye191,6138283,,137738,{{About|the musician|her self-titled debut alb...,Taylor_Swift,2012,05
14260,2012-05-20 08:41:46+00:00,493462759,Popeye191,6138283,"/* 2010–12: Speak Now release, 13-month world ...",137739,{{About|the musician|her self-titled debut alb...,Taylor_Swift,2012,05
14261,2012-05-20 08:39:01+00:00,493462472,Popeye191,6138283,"/* 2010–12: Speak Now release, 13-month world ...",137742,{{About|the musician|her self-titled debut alb...,Taylor_Swift,2012,05
14262,2012-05-20 08:34:38+00:00,493462061,Popeye191,6138283,"/* 2010–12: Speak Now release, 13-month world ...",137808,{{About|the musician|her self-titled debut alb...,Taylor_Swift,2012,05


In [65]:
#% of edits made by each userid
ts_df_BB_after['userid'].value_counts(normalize=True).head(10)

userid
6138283     0.672
16287261    0.112
12615911    0.064
16710       0.048
2530149     0.024
16849652    0.024
13096171    0.016
14457634    0.008
16300118    0.008
1423121     0.008
Name: proportion, dtype: float64

In [99]:
data = {
    'Date': ['One Week Before', 'One Week After'],
    'Kanye West Edits': [4, 152],
    'Taylor Swift Edits': [14, 148]
}
df = pd.DataFrame(data)

def calculate_percent_change(before, after):
    if before == 0:
        return "N/A"
    return f"+{((after - before) / before * 100):.0f}%"

# Create figure for total edits
fig_edits = go.Figure()

# Add bars for edits
fig_edits.add_trace(
    go.Bar(
        name='Kanye West',
        x=df['Date'],
        y=df['Kanye West Edits'],
        text=[f"{y}<br>({calculate_percent_change(4, y)})" if i == 1 else y 
              for i, y in enumerate(df['Kanye West Edits'])],
        textposition='auto',
        marker_color='#CCA677',
        opacity=0.95,
        width=0.3
    )
)

fig_edits.add_trace(
    go.Bar(
        name='Taylor Swift',
        x=df['Date'],
        y=df['Taylor Swift Edits'],
        text=[f"{y}<br>({calculate_percent_change(14, y)})" if i == 1 else y 
              for i, y in enumerate(df['Taylor Swift Edits'])],
        textposition='auto',
        marker_color='#5D4037',
        opacity=0.95,
        width=0.3
    )
)

# Update layout for edits
fig_edits.update_layout(
    title={
        'text': 'Wikipedia Page Edits Before and After VMAs',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 24}
    },
    yaxis_title="Number of Edits",
    showlegend=True,
    legend={'orientation': 'h', 'y': -0.2},
    template='plotly_white',
    barmode='group',
    hoverlabel=dict(bgcolor="white"),
    margin=dict(t=50, b=50),
    height=500,
    annotations=[
    ],
    font_family = "Raleway",
    font_color = "#5D4037",

)

# Show both figures
fig_edits.show()

# If you want to save the figures
# fig_edits.write_html("vma_edits_analysis.html")
# fig_controversy.write_html("vma_controversy_analysis.html")