# Scraping Tweets From Twitter Using SnScrape

- We scrape tweets with snscrape and use the pandas library for data manipulation.


In [None]:
# Run the pip install command below if you don't already have the snscrape library installed
!pip install git+https://github.com/JustAnotherArchivist/snscrape.git

In [None]:
# Run the below command if you don't already have Pandas
!pip install pandas

In [None]:
# Import packages
import snscrape.modules.twitter as sntwitter
import pandas as pd
import time
import json

## Scrape user's tweets

- Change `maxTweets` to limit the number of tweets scraped, and change the user ID in the `from:` query to scrape a different user's timeline.

- Note: The tweet count displayed for a user includes both the tweets written by the user and the tweets retweeted by the user. However, due to the nature of Twitter, we can only collect tweets written by the user, not the ones retweeted by that user.

In [None]:
# Maximum number of tweets to collect from the user's timeline
maxTweets = 30000

# One inner list of tweet fields is appended here per scraped tweet
tweets_list = []

# Using TwitterSearchScraper to scrape the user's tweets and append them to the list.
# To restrict the scrape to a specific timeframe, add date operators to the query,
# e.g. 'from:PwCUS since:2008-01-01 until:2014-03-01'
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('from:PwCUS').get_items()):
    # enumerate() is zero-based: when i == maxTweets, exactly maxTweets tweets
    # have already been stored, so stop here to avoid collecting one extra.
    if i >= maxTweets:
        break
    # Progress indicator: print the running count every 100 tweets
    if i % 100 == 0:
        print(i)
    tweets_list.append([tweet.user.username, tweet.date, tweet.id, tweet.content,
                        tweet.replyCount, tweet.retweetCount, tweet.likeCount,
                        tweet.quoteCount, tweet.mentionedUsers,
                        tweet.url])
    # Pause after each tweet to reduce the risk of being blocked by Twitter
    time.sleep(0.2)

In [None]:
# Column labels, in the same order as the fields stored for each tweet
tweet_columns = [
    'UserID',
    'DateTime',
    'TweetID',
    'Text',
    'ReplyCount',
    'RetweetCount',
    'LikeCount',
    'QuoteCount',
    'MentionedUsers',
    'Links',
]

# Turn the scraped rows into a labelled dataframe and display it
tweets_df = pd.DataFrame(data=tweets_list, columns=tweet_columns)
tweets_df

In [None]:
# Split the DateTime column into separate Date and Time columns.
# The column is parsed once and both parts are derived from the same result.
timestamps = pd.to_datetime(tweets_df["DateTime"])
Date = timestamps.dt.date
Time = timestamps.dt.time

# Place the new columns right after the first column (positions 1 and 2)
tweets_df.insert(loc=1, column="Date", value=Date)
tweets_df.insert(loc=2, column="Time", value=Time)

# The combined timestamp is no longer needed once it has been split
tweets_df = tweets_df.drop(columns=["DateTime"])
tweets_df

In [None]:
# Write the dataframe to a comma-separated file, leaving out the row index
tweets_df.to_csv(
    path_or_buf='PwCUS-tweets.csv',
    sep=',',
    index=False,
)

## Scrape tweets from text search query

- Change `maxTweets` to limit the number of tweets scraped, and change the search query (the search text and the `since:`/`until:` dates) to scrape tweets matching a different search.
- Use `time.sleep()` to avoid getting blocked by Twitter's anti-web-crawler measures.

In [None]:
# Maximum number of tweets to collect from the search results
maxTweets = 1000000

# One inner list of tweet fields is appended here per scraped tweet
tweets_list = []

# Using TwitterSearchScraper to scrape tweets matching the search query
# (including its since:/until: date range) and append them to the list
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('@PwCUS since:2009-03-01 until:2021-04-20').get_items()):
    # enumerate() is zero-based: when i == maxTweets, exactly maxTweets tweets
    # have already been stored, so stop here to avoid collecting one extra.
    if i >= maxTweets:
        break
    # Progress indicator: print the running count every 100 tweets
    if i % 100 == 0:
        print(i)
    tweets_list.append([tweet.user.username, tweet.date, tweet.id, tweet.content,
                        tweet.replyCount, tweet.retweetCount, tweet.likeCount,
                        tweet.quoteCount, tweet.mentionedUsers,
                        tweet.url])
    # Optional throttle, disabled by default; uncomment to pause after each tweet
    # time.sleep(0.01)

In [None]:
# Column labels, in the same order as the fields stored for each tweet
tweet_columns = [
    'UserID',
    'DateTime',
    'TweetID',
    'Text',
    'ReplyCount',
    'RetweetCount',
    'LikeCount',
    'QuoteCount',
    'MentionedUsers',
    'Links',
]

# Turn the scraped rows into a labelled dataframe and display it
tweets_df = pd.DataFrame(data=tweets_list, columns=tweet_columns)
tweets_df

In [None]:
# Split the DateTime column into separate Date and Time columns.
# The column is parsed once and both parts are derived from the same result.
timestamps = pd.to_datetime(tweets_df["DateTime"])
Date = timestamps.dt.date
Time = timestamps.dt.time

# Place the new columns right after the first column (positions 1 and 2)
tweets_df.insert(loc=1, column="Date", value=Date)
tweets_df.insert(loc=2, column="Time", value=Time)

# The combined timestamp is no longer needed once it has been split
tweets_df = tweets_df.drop(columns=["DateTime"])
tweets_df

In [None]:
# Write the dataframe to a comma-separated file, leaving out the row index
tweets_df.to_csv(
    path_or_buf='@PwCUS.csv',
    sep=',',
    index=False,
)