## Import libraries

In [3]:
!pip install snscrape

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting snscrape
  Downloading snscrape-0.4.3.20220106-py3-none-any.whl (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.1 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: snscrape
Successfully installed snscrape-0.4.3.20220106


In [4]:
import snscrape.modules.twitter as sntwitter
import datetime as dt
import pandas as pd


## Scrape tweets

Scrape up to 365k positive (using ":)" as the query) and up to 365k negative (using ":(" as the query) English tweets, excluding retweets, from the year 2022. We scrape at most 1,000 tweets per day for each category.

In [None]:
def scrape_tweet(search_term, start_date, end_date, num_tweets):
    """Collect up to ``num_tweets`` tweets matching ``search_term`` in a date window.

    The search is restricted to English tweets and excludes retweets.

    Parameters
    ----------
    search_term : str
        Text placed at the front of the search query.
    start_date, end_date : datetime-like
        Bounds of the window; each is formatted as ``YYYY-MM-DD`` and passed to
        the ``since:`` / ``until:`` search operators respectively.
    num_tweets : int
        Maximum number of tweets to keep.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with columns ``username``, ``date`` and ``tweet``.
    """
    since = start_date.strftime("%Y-%m-%d")
    until = end_date.strftime("%Y-%m-%d")
    query = "{} since:{} until:{} lang:en exclude:retweets".format(
        search_term, since, until
    )
    scraper = sntwitter.TwitterSearchScraper(query)

    rows = []
    for tweet in scraper.get_items():
        # Stop as soon as the requested number of tweets has been collected.
        if len(rows) >= num_tweets:
            break
        rows.append([tweet.user.username, tweet.date, tweet.content])

    return pd.DataFrame(rows, columns=["username", "date", "tweet"])

In [None]:
def daily_scrape_2022(search_term, num_daily):
    """Scrape up to ``num_daily`` tweets for each of the 365 days of 2022.

    Each day is scraped as its own window (that day to the next day) so the
    sample is spread evenly across the year instead of being dominated by
    whichever tweets the search returns first.

    Parameters
    ----------
    search_term : str
        Search text forwarded to ``scrape_tweet``.
    num_daily : int
        Maximum number of tweets to collect per day.

    Returns
    -------
    pandas.DataFrame
        The daily frames stacked in calendar order. The index is not reset, so
        each day's rows keep the index values of their own daily frame.
    """
    first_day = dt.datetime(2022, 1, 1)
    one_day = dt.timedelta(days=1)

    # Collect the per-day frames and concatenate once at the end: growing a
    # DataFrame with pd.concat inside the loop re-copies every accumulated row
    # on each iteration.
    daily_frames = []
    for offset in range(365):
        window_start = first_day + offset * one_day
        daily_frames.append(
            scrape_tweet(search_term, window_start, window_start + one_day, num_daily)
        )
    return pd.concat(daily_frames)

In [None]:
# Raw negative sample: up to 1000 tweets per day of 2022 matching the ":(" query.
ori_neg_df = daily_scrape_2022(search_term=":(", num_daily=1000)


In [None]:
# Raw positive sample: up to 1000 tweets per day of 2022 matching the ":)" query.
ori_pos_df = daily_scrape_2022(search_term=":)", num_daily=1000)


## Filter scraped tweets

In [None]:
def filter_include(df, term_list):
    """Keep the rows whose tweet text contains at least one of the given terms.

    Terms are matched as literal substrings (no regular expressions). Rows with
    missing tweet text never match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` text column.
    term_list : iterable of str
        Substrings to look for.

    Returns
    -------
    pandas.DataFrame
        Matching rows with a fresh 0..n-1 index and fully identical rows
        de-duplicated (first occurrence kept). Rows are grouped by the first
        term they matched, in ``term_list`` order. When ``term_list`` is empty
        the result has zero rows but keeps the columns of ``df``.
    """
    # One slice per term; a row matching several terms appears in several slices
    # and is collapsed by drop_duplicates below.
    matches = [
        df[df["tweet"].str.contains(term, regex=False, na=False)]
        for term in term_list
    ]
    if not matches:
        # Nothing to match against: return no rows but preserve the schema so
        # downstream column lookups (e.g. "tweet") still work.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate and de-duplicate once rather than on every iteration.
    return pd.concat(matches).drop_duplicates(ignore_index=True)

In [None]:
def filter_exclude(df, term_list):
    """Drop the rows whose tweet text contains any of the given terms.

    Terms are matched as literal substrings (no regular expressions). The input
    frame is left untouched; a filtered copy is returned with the original index
    labels of the surviving rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` text column.
    term_list : iterable of str
        Substrings that disqualify a row.

    Returns
    -------
    pandas.DataFrame
        Rows for which every term's containment check compared equal to False.
    """
    remaining = df.copy()
    for banned in term_list:
        has_term = remaining["tweet"].str.contains(banned, regex=False)
        # Keep only rows whose match result compares equal to False.
        remaining = remaining[has_term.eq(False)]
    return remaining

Filter negative tweets: keep tweets that contain a sad emoticon and drop any that also contain a happy one.



In [None]:
# Keep tweets carrying a sad emoticon, then discard any that also carry a happy one.
neg_df = (
    ori_neg_df
    .pipe(filter_include, [":(", ":-("])
    .pipe(filter_exclude, [":)", ":D", ":-)"])
)
neg_df.shape

(358624, 3)

Filter positive tweets: keep tweets that contain a happy emoticon and drop any that also contain a sad one.

In [None]:
# Keep tweets carrying a happy emoticon, then discard any that also carry a sad one.
pos_df = (
    ori_pos_df
    .pipe(filter_include, [":)", ":D", ":-)"])
    .pipe(filter_exclude, [":(", ":-("])
)
pos_df.shape

(343477, 3)

## Remove emojis from tweets

Remove the emoticons that were used to select the tweets, because we want the model to classify tweet sentiment from the text rather than from the emoticons themselves.

In [None]:
def remove_term(df, term_list):
    """Replace every occurrence of the given terms in the tweet text with a space.

    Terms are treated as literal substrings (no regular expressions) and are
    applied one after another in ``term_list`` order. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` text column.
    term_list : iterable of str
        Substrings to blank out.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``tweet`` column has the terms replaced by " ".
    """
    cleaned = df.copy()
    for unwanted in term_list:
        blanked = cleaned["tweet"].str.replace(unwanted, " ", regex=False)
        cleaned["tweet"] = blanked
    return cleaned

In [None]:
# Blank out the sad emoticons that were used to select the negative tweets.
neg_df = neg_df.pipe(remove_term, [":(", ":-("])

In [None]:
# Blank out the happy emoticons that were used to select the positive tweets.
pos_df = pos_df.pipe(remove_term, [":)", ":D", ":-)"])

## Label tweets and merge them into a dataframe

In [None]:
# Tag each frame with its class label, then stack negatives above positives
# under a single fresh 0..n-1 index.
neg_df["sentiment"] = "Negative"
pos_df["sentiment"] = "Positive"
df = pd.concat([neg_df, pos_df], ignore_index=True)

In [None]:
# Persist the labelled dataset as CSV, without writing the row index as a column.
# NOTE(review): assumes Google Drive is already mounted at /content/drive and that
# the dataset/ folder exists — no mount step is visible in this notebook; confirm.
df.to_csv("/content/drive/MyDrive/dataset/labeled_tweets.csv", index=False)