# Sentiment Analysis of Tweets

### Chichi, Mariana, Nate, and Taylor

## Part 1: Necessary import statements and loading the data

In [48]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from datetime import datetime
from datetime import timedelta

#### Importing the datasets and cleaning the data a bit

In [49]:
tweets = pd.read_csv("./tweetrealdata.csv", header=None)

wikipedia_data = pd.read_csv("./wikipedia_master.csv")
wikipedia_data.drop(columns=["Unnamed: 0"], inplace=True)
seasons_dates = wikipedia_data[["Season", "Date Aired"]]

#### Checking to see that the dates are formatted correctly

In [50]:
tweets_text = tweets[1]
dates = tweets[5]
dates

0        2021-11-30 00:58:21+00:00
1        2021-11-30 00:57:33+00:00
2        2021-11-30 00:57:04+00:00
3        2021-11-30 00:53:43+00:00
4        2021-11-30 00:53:41+00:00
                   ...            
11888    2020-11-16 20:16:53+00:00
11889    2020-11-16 20:11:35+00:00
11890    2020-11-16 20:09:07+00:00
11891    2020-11-16 20:07:16+00:00
11892    2020-11-16 20:07:16+00:00
Name: 5, Length: 11893, dtype: object

#### Changing all dates in the `seasons_dates` DataFrame to datetime objects to match those in the tweets. `datetime` objects are useful because you can perform very easy comparisons (i.e. date1 == date2 and it's all standardized.

In [51]:
dt_dates = []
for i in range(len(seasons_dates)):
    date = seasons_dates.iat[i, 1]
    date = date.split("/")
    for i in range(len(date)):
        if len(date[i]) == 1:
            date[i] = "0" + date[i]
    date = "/".join(date)

    dt_dates.append(datetime.strptime(date, "%m/%d/%y"))

ep_dates = pd.DataFrame(data={"Date Aired": dt_dates})

seasons_dates.drop(columns=["Date Aired"], inplace=True)
seasons_dates = seasons_dates.join(ep_dates)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


#### Here we can see the seasons and the dates all standardized

In [52]:
seasons_dates[50:70]

Unnamed: 0,Season,Date Aired
50,14,2018-06-04
51,14,2018-06-11
52,14,2018-06-18
53,14,2018-06-25
54,14,2018-07-02
55,14,2018-07-09
56,14,2018-07-16
57,14,2018-07-23
58,14,2018-08-06
59,15,2019-05-13


#### Creating a new list called `tweets_clean` which has the format (text of the tweet, season #, and the date of the tweet). 

In [53]:
tweets_clean = []

for i in range(len(tweets)):
    
    # Extracting the text of the tweet
    tweet = tweets_text.iat[i]
    
    # Removing retweets
    if tweet[:2] == "RT":
        continue

    # extracting the date from the tweet as a datetime object
    tweet_date = dates[i]
    tweet_date = tweet_date[:10]
    tweet_date = datetime.strptime(tweet_date, "%Y-%m-%d")
    
    one_day = timedelta(days=7)
    
    
    season = None
    date = None

    for i in range(len(seasons_dates)):
        # Since some people often tweet on days that episodes don't air, 
        # so this goes to match the correct season with the date of the tweet, give or take a week. 
        # Since no two seasons occur within a week of each other, any tweet that happens a week within 
        # any episode of a season is matched with that season. 
        if tweet_date == seasons_dates.iat[i, 1] or tweet_date == seasons_dates.iat[i, 1] + one_day or tweet_date == seasons_dates.iat[i, 1] - one_day:
            date = tweet_date
            season = seasons_dates.iat[i, 0]
    
    tweets_clean.append((tweet, season, tweet_date))

tweets_clean


[('BYE MARTIN I HOPE THE DOOR HITS YOU ON THE WAY OUT #FUCKYOU #TheBachelorette',
  18,
  datetime.datetime(2021, 11, 30, 0, 0)),
 ('Wow, just getting caught up on #TheBachelorette and man, Michelle is just getting no respect from these guys. Like none.',
  18,
  datetime.datetime(2021, 11, 30, 0, 0)),
 ('https://t.co/DnnFOU0dmU #TheBachelorette',
  18,
  datetime.datetime(2021, 11, 30, 0, 0)),
 ('#TheBachelorette yo fuck Martin toxic ass bitch',
  18,
  datetime.datetime(2021, 11, 30, 0, 0)),
 ('Martin walked so Chris S. Could run #TheBachelorette',
  18,
  datetime.datetime(2021, 11, 30, 0, 0)),
 ('#TheBachelorette  Bachelorette Spoilers: Who did Michelle Young pick as her winner &amp; runner-up? Who ended up as… https://t.co/Gng3mOPXe9',
  18,
  datetime.datetime(2021, 11, 30, 0, 0)),
 ('🗣We need #TheBachelorette back on Monday nights🗣',
  18,
  datetime.datetime(2021, 11, 30, 0, 0)),
 ("if you're not sweating are you actually working out? go challenge Tylerjcameron3 and get to the 

# Part 2: Sentiment Analysis

In [54]:
import re

sentiment_analysis_scores = []

sia = SentimentIntensityAnalyzer()

for i in range(len(tweets_clean)):
    # Extracting the text of the tweets
    text = tweets_clean[i][0]
    
    # removing hashtags, links, and "TheBachelorette hashtag"
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", text).split()) # For removing hashtags and @s
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE) # Removing links
    text = text.replace("TheBachelorette", "") # Since every tweet has "TheBachelorette" (from the hashtag), this just removes it
    
    season = tweets_clean[i][1]
    date = tweets_clean[i][2]
    scores = sia.polarity_scores(text)
    
    # Keeps tweets_clean, but adds a new field to each entry which contains the scores from the sentiment analysis
    tweets_clean[i] = (text, season, date, scores)

# puttin the data in a dataframe so that we can save it as a csv
tweets_clean_df = pd.DataFrame(data=tweets_clean, columns=["Text", "Season", "Date", "Sentiment Analysis"])
tweets_clean_df.head()
tweets_clean

[('BYE MARTIN I HOPE THE DOOR HITS YOU ON THE WAY OUT FUCKYOU ',
  18,
  datetime.datetime(2021, 11, 30, 0, 0),
  {'neg': 0.0, 'neu': 0.791, 'pos': 0.209, 'compound': 0.4404}),
 ('Wow just getting caught up on  and man Michelle is just getting no respect from these guys Like none',
  18,
  datetime.datetime(2021, 11, 30, 0, 0),
  {'neg': 0.083, 'neu': 0.564, 'pos': 0.353, 'compound': 0.802}),
 ('',
  18,
  datetime.datetime(2021, 11, 30, 0, 0),
  {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}),
 (' yo fuck Martin toxic ass bitch',
  18,
  datetime.datetime(2021, 11, 30, 0, 0),
  {'neg': 0.783, 'neu': 0.217, 'pos': 0.0, 'compound': -0.8957}),
 ('Martin walked so Chris S Could run ',
  18,
  datetime.datetime(2021, 11, 30, 0, 0),
  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 (' Bachelorette Spoilers Who did Michelle Young pick as her winner amp runner up Who ended up as',
  18,
  datetime.datetime(2021, 11, 30, 0, 0),
  {'neg': 0.0, 'neu': 0.808, 'pos': 0.192, 'compou

#### Saving as a csv file for durability. The filename here is different so that this code, if rerun, doesn't overwrite manually cleaned csv which is used later on.

In [55]:
tweets_clean_df.to_csv("./twitter_data_not_manually_cleaned.csv")