#                            -----            Twitter scraper with snscrape            -----

This notebook scrapes tweets matching given keywords from Twitter and stores them in clean CSV datasets that can be used for data analysis or machine learning. It relies on the web scraper 'snscrape' and does not use the official Twitter API.

The created datasets contain the following columns: 

- date (datetime)
- text (str)
- number of retweets (float)
- number of likes (float)
- number of views (float) (since December 2022)
- language (str)
- username (str)
- number of followers of the user (float)

In [1]:
#pip install snscrape
#pip install pandas
#pip install tqdm

In [2]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime as dt
import time
import os
from tqdm.notebook import tqdm # make your loops show a progress meter
import itertools               #library for loops

## Extraction of raw data

- Defining the query 

In [3]:
def format_query(company, start_date, end_date):
    """Build the search query for one name over a date range.

    A compound name written with underscores is searched both as the quoted
    original and as its parts glued together
    (example: 'Arcelor_Mittal' -> '"Arcelor_Mittal"  OR ArcelorMittal').
    A name without underscore is used as is.

    Parameters
    ----------
    company : str
        Name to search; underscores separate the parts of a compound name.
    start_date, end_date
        Values inserted verbatim after the ``since:`` and ``until:`` operators.

    Returns
    -------
    str
        The query string ``"<term> since:<start_date> until:<end_date>"``.
    """
    parts = company.split("_")

    if len(parts) == 1:
        search_term = company
    else:
        glued = "".join(parts)
        search_term = f'"{company}"  OR {glued}'

    return f"{search_term} since:{start_date} until:{end_date}"
    

- Getting a list of objects 'tweets' with the snscrape function and putting them in a dataframe

In [4]:
def search_tweets(query, maxTweets):
    """Run a snscrape Twitter search and return the raw tweets as a DataFrame.

    Parameters
    ----------
    query : str
        Search query handed to ``sntwitter.TwitterSearchScraper``.
    maxTweets : int
        Maximum number of items taken from the scraper; also used as the
        total of the progress bar.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``date``, ``user``, ``text``,
        ``retweets``, ``likes``, ``views`` and ``lang``. When the search
        yields nothing, an empty frame with these same columns is returned.
    """
    raw_columns = ['date', 'user', 'renderedContent', 'retweetCount', 'likeCount', 'viewCount', 'lang']
    renamed = {'retweetCount': 'retweets', 'likeCount': 'likes', 'viewCount': 'views', 'renderedContent': 'text'}

    scraper = sntwitter.TwitterSearchScraper(query)
    tweets = list(tqdm(itertools.islice(scraper.get_items(), maxTweets), total=maxTweets))

    if not tweets:
        # A frame built from zero items has no columns at all, so selecting
        # raw_columns on it would raise KeyError; return the expected schema.
        return pd.DataFrame(columns=[renamed.get(col, col) for col in raw_columns])

    # rename() without inplace returns a new frame instead of mutating a column slice
    return pd.DataFrame(tweets)[raw_columns].rename(columns=renamed)

## Data cleaning

In [5]:
import re

- Removing links and @mentions with a regex, then collapsing redundant whitespace and empty lines

In [6]:
def Cleaning(df):
    """Strip links, @mentions and redundant whitespace from the 'text' column.

    Rows whose text is missing are dropped before any substitution, the index
    is reset, and the 'text' column is cast to ``str``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned 'text' column and a fresh 0..n-1 index.
    """
    regex_links_arobase = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))|@\w+|(https?:\/\/)?(www\.)?[a-z0-9-]+\.(com|org)(\.[a-z]{2,3})?"
    regex_space = r'\s+'

    # Drop missing texts first: re.sub raises TypeError on None/NaN, so doing
    # this after the substitutions (as before) could never drop anything.
    cleaned = df.dropna(subset=['text'], axis=0).reset_index(drop=True)

    def _clean_text(text):
        # links and @mentions become a space so neighbouring words stay apart
        without_links = re.sub(regex_links_arobase, ' ', text)
        # every whitespace run (newlines included) collapses to one space,
        # which also removes empty lines
        return re.sub(regex_space, ' ', without_links).strip()

    cleaned['text'] = cleaned['text'].apply(_clean_text)

    return cleaned.astype({"text": 'str'})


- Adding the username and followers columns from the information in the user column

In [7]:
def add_username_and_followers(df):
    """Replace the 'user' column by 'username' and 'followers' columns.

    Each entry of the 'user' column is read with the keys 'username' and
    'followersCount'. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'user' column of mapping-like entries.

    Returns
    -------
    pandas.DataFrame
        The same frame object, without 'user' and with the two new columns
        appended at the end.
    """
    # Extract both lists before touching the frame, so a bad entry leaves it unchanged
    user_entries = df['user'].tolist()
    names = [entry['username'] for entry in user_entries]
    follower_counts = [entry['followersCount'] for entry in user_entries]

    df['username'] = names
    df['followers'] = follower_counts

    del df['user']

    return df

## Saving data

- Saving tweets in a csv file

In [8]:
def save_tweets_to_csv(name, df):
    """Write df to 'tweets/<name>_tweets.csv' without overwriting earlier files.

    The 'tweets' folder is created when missing and the name is lower-cased.
    If the target file already exists, a counter (1, 2, ...) is appended after
    'tweets' in the file name until an unused path is found. The chosen path
    is printed once the file is written.

    Parameters
    ----------
    name : str
        Base name of the file.
    df : pandas.DataFrame
        Data to save (written without the index).
    """
    folder_path = "tweets"
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    stem = f"{name.lower()}_tweets"
    file_path = os.path.join(folder_path, stem + ".csv")

    # Probe stem1.csv, stem2.csv, ... until a free file name turns up
    counter = 0
    while os.path.exists(file_path):
        counter += 1
        file_path = os.path.join(folder_path, f"{stem}{counter}.csv")

    df.to_csv(file_path, index=False)
    print(f"The file {file_path} was successfully created.")


- Main function: calls all the functions defined above

In [9]:
def Scraping(company, start_date, end_date, maxTweets):
    """Scrape, clean and save the tweets for one name.

    Chains format_query -> search_tweets -> Cleaning ->
    add_username_and_followers -> save_tweets_to_csv.

    Parameters
    ----------
    company : str
        Name to search; also used as the base name of the CSV file.
    start_date, end_date
        Bounds of the search period, passed to format_query.
    maxTweets : int
        Maximum number of tweets to fetch.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to the CSV file.
    """
    query = format_query(company, start_date, end_date)
    df_raw = search_tweets(query, maxTweets)
    # Cleaning already drops the rows without text, so no second dropna here
    df_clean = Cleaning(df_raw)
    df_final = add_username_and_followers(df_clean)
    save_tweets_to_csv(company, df_final)

    # Return the saved frame explicitly rather than relying on df_clean having
    # been modified in place by add_username_and_followers
    return df_final

## User part

- Fill in the following four parameters:

    - List of the names to search
    - start date
    - end date
    - maximum number of tweets per name. 

In [10]:
# Names to search; an underscore marks a compound name (see format_query)
comp_list = ['air_france', 'credit_agricole']

# Search period, inserted after the since:/until: operators of the query
# NOTE(review): Twitter presumably treats until: as exclusive — confirm
start_date= "2022-12-01"
end_date = "2023-01-01"
# Maximum number of tweets fetched per name
maxTweets = 30

- Run the main function

In [11]:
# One Scraping run (and one CSV file) per name; this outer bar counts names,
# while search_tweets shows its own bar for the tweets of each name
for comp in tqdm(comp_list, desc = "Progression :  "):
    Scraping(comp, start_date, end_date, maxTweets)

Progression :  :   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

The file tweets\air_france_tweets3.csv was successfully created.


  0%|          | 0/30 [00:00<?, ?it/s]

The file tweets\credit_agricole_tweets.csv was successfully created.
