In [25]:
import mwclient  # Library for interacting with MediaWiki-based wikis, such as Wikipedia
import time  # Library for time-related functions
from transformers import pipeline  # Library for accessing pre-trained models for various NLP tasks
from statistics import mean  # Library for statistical calculations, such as mean
import pandas as pd  # Library for data manipulation and analysis
from datetime import datetime  # Library for handling dates and times

# Finding sentiments of page edits

In [3]:
# Connect to the English Wikipedia site
site = mwclient.Site('en.wikipedia.org')

# Access the Wikipedia page for 'Bitcoin'
page = site.pages['Bitcoin']


In [4]:
# Retrieve and list all revisions of the 'Bitcoin' Wikipedia page
revs = list(page.revisions())


In [5]:
# Access the most recent revision of the 'Bitcoin' Wikipedia page
revs[0]


OrderedDict([('revid', 1240407957),
             ('parentid', 1239083268),
             ('user', 'Citation bot'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=8, tm_mday=15, tm_hour=6, tm_min=35, tm_sec=19, tm_wday=3, tm_yday=228, tm_isdst=-1)),
             ('comment',
              'Altered title. Add: bibcode, authors 1-1. Removed parameters. Some additions/deletions were parameter name changes. | [[:en:WP:UCB|Use this bot]]. [[:en:WP:DBUG|Report bugs]]. | Suggested by Abductive | [[Category:Bitcoin]] | #UCB_Category 40/42')])

In [6]:
# Sort the list of revisions by their timestamp in ascending order
revs = sorted(revs, key=lambda rev: rev['timestamp'])


In [7]:
# Access the earliest revision of the 'Bitcoin' Wikipedia page after sorting by timestamp
revs[0]


OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [8]:
# Initialize a sentiment analysis pipeline from Hugging Face's transformers library
sentiment_pipeline = pipeline('sentiment-analysis')

def find_sentiment(text):
    """
    Analyzes the sentiment of a given text.

    Parameters:
    - text (str): The text to analyze for sentiment.

    Returns:
    - float: The sentiment score of the text. Negative score indicates negative sentiment.
    """
    # Analyze sentiment of the first 250 characters of the text
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent['score']  # Get the sentiment score
    if sent['label'] == 'NEGATIVE':  # If sentiment is negative, invert the score
        score *= -1
    return score


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)
  return self.randrange(a, b+1)

In [9]:
# Analyze the sentiment of the phrase 'i love you'
find_sentiment('i love you')


0.9998656511306763

In [10]:
# Analyze the sentiment of the phrase 'i hate you'
find_sentiment('i hate you')


-0.9991129040718079

In [11]:
# Initialize an empty dictionary to store edits or modifications
edits = {}


In [12]:
# Process each revision to categorize edits by date and analyze sentiment of edit comments
for rev in revs:
    # Format the revision timestamp as a date string
    date = time.strftime('%Y-%m-%d', rev["timestamp"])
    
    # Initialize the dictionary entry for the date if it doesn't exist
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)
    
    # Increment the edit count for the date
    edits[date]['edit_count'] += 1
    
    # Retrieve and analyze the comment from the revision
    comment = rev.get("comment", "")
    edits[date]["sentiments"].append(find_sentiment(comment))


In [18]:
# Calculate average sentiment and proportion of negative sentiments for each date
for key in edits:
    if len(edits[key]['sentiments']) > 0:
        # Compute the average sentiment score for the date
        edits[key]['sentiment'] = mean(edits[key]['sentiments'])
        # Calculate the proportion of negative sentiments
        edits[key]['neg_sentiment'] = len([s for s in edits[key]['sentiments'] if s < 0]) / len(edits[key]['sentiments'])
    else:
        # If no sentiments were recorded, set sentiment values to 0
        edits[key]['sentiment'] = 0
        edits[key]['neg_sentiment'] = 0
    
    # Remove the raw sentiment list, as it's no longer needed
    del edits[key]['sentiments']


In [19]:
# Display the final dictionary containing the processed edit information
edits


{'2009-03-08': {'edit_count': 4,
  'sentiment': -0.550525039434433,
  'neg_sentiment': 0.75},
 '2009-08-05': {'edit_count': 1,
  'sentiment': 0.7481208443641663,
  'neg_sentiment': 0.0},
 '2009-08-06': {'edit_count': 2,
  'sentiment': 0.995745837688446,
  'neg_sentiment': 0.0},
 '2009-08-14': {'edit_count': 1,
  'sentiment': 0.9300214052200317,
  'neg_sentiment': 0.0},
 '2009-10-13': {'edit_count': 2,
  'sentiment': -0.22749891877174377,
  'neg_sentiment': 0.5},
 '2009-11-18': {'edit_count': 1,
  'sentiment': 0.8839514255523682,
  'neg_sentiment': 0.0},
 '2009-12-08': {'edit_count': 1,
  'sentiment': -0.9869275689125061,
  'neg_sentiment': 1.0},
 '2009-12-17': {'edit_count': 1,
  'sentiment': -0.9975171089172363,
  'neg_sentiment': 1.0},
 '2010-02-23': {'edit_count': 1,
  'sentiment': -0.9994946718215942,
  'neg_sentiment': 1.0},
 '2010-03-18': {'edit_count': 1,
  'sentiment': 0.8758782148361206,
  'neg_sentiment': 0.0},
 '2010-04-13': {'edit_count': 4,
  'sentiment': 0.844356253743171

# Converting sentiment data into dataframe

In [21]:
# Convert the 'edits' dictionary into a pandas DataFrame, with dates as the index
edits_df = pd.DataFrame.from_dict(edits, orient='index')


In [22]:
# Display the DataFrame containing the processed edit information
edits_df


Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-08-05,1,0.748121,0.00
2009-08-06,2,0.995746,0.00
2009-08-14,1,0.930021,0.00
2009-10-13,2,-0.227499,0.50
...,...,...,...
2024-07-28,2,-0.008737,0.50
2024-07-29,1,-0.696867,1.00
2024-07-31,1,-0.999386,1.00
2024-08-07,1,0.997663,0.00


In [24]:
# Convert the index of the DataFrame to datetime format for easier time series analysis
edits_df.index = pd.to_datetime(edits_df.index)


In [27]:
# Generate a date range from March 8, 2009, to today's date
dates = pd.date_range(start="2009-03-08", end=datetime.today())


In [28]:
# Display the generated date range
dates


DatetimeIndex(['2009-03-08', '2009-03-09', '2009-03-10', '2009-03-11',
               '2009-03-12', '2009-03-13', '2009-03-14', '2009-03-15',
               '2009-03-16', '2009-03-17',
               ...
               '2024-08-07', '2024-08-08', '2024-08-09', '2024-08-10',
               '2024-08-11', '2024-08-12', '2024-08-13', '2024-08-14',
               '2024-08-15', '2024-08-16'],
              dtype='datetime64[ns]', length=5641, freq='D')

In [29]:
# Reindex the DataFrame to include all dates in the generated date range, filling missing values with 0
edits_df = edits_df.reindex(dates, fill_value=0)


In [30]:
# Display the DataFrame with reindexed dates and filled missing values
edits_df


Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-03-09,0,0.000000,0.00
2009-03-10,0,0.000000,0.00
2009-03-11,0,0.000000,0.00
2009-03-12,0,0.000000,0.00
...,...,...,...
2024-08-12,0,0.000000,0.00
2024-08-13,0,0.000000,0.00
2024-08-14,0,0.000000,0.00
2024-08-15,1,-0.999610,1.00


In [33]:
# Calculate the 15-day rolling mean of the DataFrame to smooth the time series data
rolling_edits = edits_df.rolling(15).mean()


In [34]:
# Display the DataFrame with 15-day rolling mean applied
rolling_edits


Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,,,
2009-03-09,,,
2009-03-10,,,
2009-03-11,,,
2009-03-12,,,
...,...,...,...
2024-08-12,0.200000,-0.046573,0.133333
2024-08-13,0.133333,-0.000115,0.066667
2024-08-14,0.133333,-0.000115,0.066667
2024-08-15,0.133333,-0.000130,0.066667


In [35]:
# Remove rows with missing values resulting from the rolling mean calculation
rolling_edits = rolling_edits.dropna()


In [36]:
# Display the DataFrame with rolling mean applied and missing values removed
rolling_edits


Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-22,0.266667,-0.036702,0.050000
2009-03-23,0.000000,0.000000,0.000000
2009-03-24,0.000000,0.000000,0.000000
2009-03-25,0.000000,0.000000,0.000000
2009-03-26,0.000000,0.000000,0.000000
...,...,...,...
2024-08-12,0.200000,-0.046573,0.133333
2024-08-13,0.133333,-0.000115,0.066667
2024-08-14,0.133333,-0.000115,0.066667
2024-08-15,0.133333,-0.000130,0.066667


In [37]:
# Save the DataFrame with rolling mean to a CSV file
rolling_edits.to_csv("D:/VSCode Folders/Bitcoin Prediction/wikipedia edits.csv")
