In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import time
import string
import warnings

In [2]:
# for all NLP related operations on text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.classify import NaiveBayesClassifier
from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [5]:
# To mock web-browser and scrap tweets
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [8]:
# To consume Twitter's API
import tweepy
from tweepy import OAuthHandler 

In [10]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: textblob
Successfully installed textblob-0.18.0.post0


In [11]:
# To identify the sentiment of text
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.np_extractors import ConllExtractor

In [12]:
# ignoring all the warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [13]:
# downloading stopwords corpus
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('conll2000')
nltk.download('brown')
stopwords = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /home/zaens/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/zaens/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/zaens/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/zaens/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/zaens/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /home/zaens/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package conll2000 to /home/zaens/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package brown to /home/zaens/nl

In [14]:
# for showing all the plots inline
%matplotlib inline

In [33]:
class SeleniumClient(object):
    def __init__(self):
        #Initialization method. 
        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--no-sandbox')
        self.chrome_options.add_argument('--disable-setuid-sandbox')

        # you need to provide the path of chromdriver in your system
        self.browser = webdriver.Chrome(options=self.chrome_options)

        self.base_url = 'https://x.com/search?q='

    def get_tweets(self, query):
        #Function to fetch tweets. 
        try: 
            self.browser.get(self.base_url+query)
            time.sleep(2)

            body = self.browser.find_element_by_tag_name('body')

            for _ in range(3000):
                body.send_keys(Keys.PAGE_DOWN)
                time.sleep(0.3)

            # timeline = self.browser.find_element_by_id('timeline')
            # tweet_nodes = timeline.find_elements_by_css_selector('[data-testid="tweetText"]')

            tweet_nodes = self.browser.find_elements(By.CSS_SELECTOR, '[data-testid="tweetText"]')
            # time = self.browser.find_elements

            return pd.DataFrame({'tweets': [tweet_node.text for tweet_node in tweet_nodes]})

        
        except:
            print("Selenium - An error occured while fetching tweets.")


In [61]:
class TwitterClient(object): 
    def __init__(self): 
        #Initialization method. 
        try: 
            key = "waFDOZSdeF6A0vZoiXFIJmVr6"
            secret = "mOkLNM0BekQEduzj8Gn9RxXqZIa80oaoUJKweOxMRuktxmnNuZ"
            access_token = "1492109235337175041-AAaO1RheWIGUsWBA7ecaoEUZLNymgG"
            access_token_secret = "TchFoUVuuFQf3qKVFncVRxqK7X7ZZs4bcJ7jFILkbpJgh"
            # create OAuthHandler object 
            auth = OAuthHandler(key, secret) 
            # set access token and secret 
            auth.set_access_token(access_token, access_token_secret) 
            # create tweepy API object to fetch tweets 
            # add hyper parameter 'proxy' if executing from behind proxy "proxy='http://172.22.218.218:8085'"
            self.api = tweepy.API(auth, wait_on_rate_limit=True)
            # self.api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
            
        except :
        # except tweepy.TweepError as e:
            print(f"Error: Tweeter Authentication Failed - ")

    def get_tweets(self, query, maxTweets = 1000):
        #Function to fetch tweets. 
        # empty list to store parsed tweets 
        tweets = [] 
        sinceId = None
        max_id = -1
        tweetCount = 0
        tweetsPerQry = 100

        while tweetCount < maxTweets:
            # try:
            if (max_id <= 0):
                if (not sinceId):
                    new_tweets = self.api.search_tweets(q=query, count=tweetsPerQry)
                else:
                    new_tweets = self.api.search_tweets(q=query, count=tweetsPerQry,
                                            since_id=sinceId)
            else:
                if (not sinceId):
                    new_tweets = self.api.search_tweets(q=query, count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                else:
                    new_tweets = self.api.search_tweets(q=query, count=tweetsPerQry,
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)
            if not new_tweets:
                print("No more tweets found")
                break

            for tweet in new_tweets:
                parsed_tweet = {} 
                parsed_tweet['tweets'] = tweet.text 

                # appending parsed tweet to tweets list 
                if tweet.retweet_count > 0: 
                    # if tweet has retweets, ensure that it is appended only once 
                    if parsed_tweet not in tweets: 
                        tweets.append(parsed_tweet) 
                else: 
                    tweets.append(parsed_tweet) 
                    
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id

            # except :
            # # except tweepy.TweepError as e:
            #     # Just exit if any error
            #     # print("Tweepy error : " + str(e))
            #     print("error")
            #     break
        
        return pd.DataFrame(tweets)

In [32]:
selenium_client = SeleniumClient()

# calling function to get tweets
tweets_df = selenium_client.get_tweets('AI and Deep learning')
tweets = pd.DataFrame(tweets_df)
print(f'tweets_df Shape - {tweets.shape}')
tweets.head(10)

Selenium - An error occured while fetching tweets.
tweets_df Shape - (0, 0)


In [62]:
twitter_client = TwitterClient()

# calling function to get tweets
tweets_df = twitter_client.get_tweets('AI and Deep learning', maxTweets=7000)
print(f'tweets_df Shape - {tweets_df.shape}')
tweets_df.head(10)

Forbidden: 403 Forbidden
453 - You currently have access to a subset of Twitter API v2 endpoints and limited v1.1 endpoints (e.g. media post, oauth) only. If you need access to this endpoint, you may need a different access level. You can learn more here: https://developer.twitter.com/en/portal/product

In [52]:
def tweets(): 
    #Initialization method. 
    # try: 
    key = "waFDOZSdeF6A0vZoiXFIJmVr6"
    secret = "mOkLNM0BekQEduzj8Gn9RxXqZIa80oaoUJKweOxMRuktxmnNuZ"
    access_token = "1492109235337175041-AAaO1RheWIGUsWBA7ecaoEUZLNymgG"
    access_token_secret = "TchFoUVuuFQf3qKVFncVRxqK7X7ZZs4bcJ7jFILkbpJgh"
    # create OAuthHandler object 
    auth = OAuthHandler(key, secret) 
    print(auth)
    # set access token and secret 
    auth.set_access_token(access_token, access_token_secret) 
    # create tweepy API object to fetch tweets 
    # add hyper parameter 'proxy' if executing from behind proxy "proxy='http://172.22.218.218:8085'"
    api = tweepy.API(auth, wait_on_rate_limit=True)
        
    # except :
    # # except tweepy.TweepError as e:
    #     print(f"Error: Tweeter Authentication Failed - ")

In [53]:
tweets()

<tweepy.auth.OAuthHandler object at 0x7f24a5221450>
