# 🧪 Scraping Test - Pipeline de scraping (étape 1)

Ce notebook a pour objectif de tester le pipeline de scraping sur un échantillon de produits depuis eBay.

🎯 **Objectif** :
- Identifier les balises HTML utiles
- Tester la récupération des informations (titre, prix, lien)
- Éviter la surcharge mémoire, disque et processeur
- Valider la faisabilité du scraping

🔄 **Pipeline de scraping (résumé)** :
1. Définir les sites à scraper
2. Inspecter la structure HTML des pages
3. Identifier les sélecteurs (balises CSS/HTML)
4. Automatiser avec `requests` et `BeautifulSoup`
5. Limiter à quelques résultats (`limit=5`)
6. Ne pas stocker localement pour ce test


## IMPORTAION BIBLIOTHEQUE

In [7]:
# 📦 Installer toutes les bibliothèques nécessaires
#!pip install tweepy pandas beautifulsoup4 requests lxml snscrape


In [5]:
import tweepy
import pandas as pd
import requests
from bs4 import BeautifulSoup
import snscrape.modules.twitter as sntwitter
import re
from collections import Counter


## TWITTER SCRAPIING

In [10]:
from dotenv import load_dotenv
import os

load_dotenv()
BEARER_TOKEN = os.getenv("BEARER_TOKEN")

print("Token chargé :", BEARER_TOKEN is not None)


Token chargé : True


In [13]:
client = tweepy.Client(bearer_token=BEARER_TOKEN)
query = "(#beauty OR #fashion OR #tech OR #food) -is:retweet lang:en"
tweets = client.search_recent_tweets(query=query, max_results=5)

for tweet in tweets.data:
    print(tweet.text)


TooManyRequests: 429 Too Many Requests
Too Many Requests

In [3]:
import tweepy

# Tes clés d'API Twitter (copie/colle les tiennes)
BEARER_TOKEN = "AAAAAAAAAAAAAAAAAAAAALfq1wEAAAAAGJFkMSLlYdltHwckflUqwWLooGk%3DHlcJoKFxuftBXvCvVL1pH0bh3EBj3VlugSBZYzZV3dYrOxXacw"

# Connexion à l'API avec Tweepy v2
client = tweepy.Client(bearer_token=BEARER_TOKEN)

# Exemple simple : rechercher les tweets récents contenant certains hashtags dans 4 catégories (beauty, fashion, tech, food)

query = "(#beauty OR #fashion OR #tech OR #food) -is:retweet lang:en"

# Récupérer les 10 tweets récents
tweets = client.search_recent_tweets(query=query, max_results=10, tweet_fields=['public_metrics', 'created_at'])

for tweet in tweets.data:
    print(f"Tweet ID: {tweet.id}")
    print(f"Date: {tweet.created_at}")
    print(f"Texte: {tweet.text}")
    print(f"Likes: {tweet.public_metrics['like_count']}, Retweets: {tweet.public_metrics['retweet_count']}")
    print("-" * 50)




Tweet ID: 1926399545769963664
Date: 2025-05-24 22:06:57+00:00
Texte: “𝑩𝒆𝒂𝒖𝒕𝒚 𝒘𝒊𝒍𝒍 𝒔𝒂𝒗𝒆 𝒕𝒉𝒆 𝒘𝒐𝒓𝒍𝒅.” ~𝑭𝒚𝒐𝒅𝒐𝒓 𝑫𝒐𝒔𝒕𝒐𝒆𝒗𝒔𝒌𝒚 🧡 #art #beauty #faith #hope #love #inspiration #thoughts #journal #diary #nature
https://t.co/2PeV5n0YvM
Likes: 1, Retweets: 0
--------------------------------------------------
Tweet ID: 1926399530565603820
Date: 2025-05-24 22:06:53+00:00
Texte: #Beach #Breeze #Beauty

#merch with #digitaltease #digitalart now on your favourite #products via @redbubble
👇
https://t.co/qgBVuYC8Qz

#digitalart #shop #printondemand #findyourthing #giftideas #redbubble #shopfromhome #retailtherapy #onlineshopping #presents #gift https://t.co/tn6KIPJ3EA
Likes: 0, Retweets: 0
--------------------------------------------------
Tweet ID: 1926399478430322969
Date: 2025-05-24 22:06:41+00:00
Texte: Need a #tee that says “Everything will be ok coz I’m here”? Ready to rock your #style with a dash of confidence? Who’s in for some comfy #vibes? #fashion #hikerunner
https://t.co/DsApBV9htI https://t.co

In [1]:
import time
import tweepy
from dotenv import load_dotenv
import os
import pandas as pd

load_dotenv()

# Charger les tokens depuis le .env (deux tokens ici)
BEARER_TOKENS = [
    os.getenv("BEARER_TOKEN_1"),
    os.getenv("BEARER_TOKEN_2")
]

def get_client(token):
    return tweepy.Client(bearer_token=token)

def search_tweets_with_pagination(tokens, query, max_results=10, max_pages=5):
    tweets_data = []
    next_token = None
    pages_fetched = 0
    token_index = 0
    client = get_client(tokens[token_index])
    
    while pages_fetched < max_pages:
        try:
            response = client.search_recent_tweets(
                query=query,
                max_results=max_results,
                tweet_fields=['public_metrics', 'created_at', 'lang'],
                next_token=next_token
            )
            if not response.data:
                print("Plus de tweets disponibles.")
                break
            
            for tweet in response.data:
                tweets_data.append({
                    "id": tweet.id,
                    "date": tweet.created_at,
                    "texte": tweet.text,
                    "likes": tweet.public_metrics['like_count'],
                    "retweets": tweet.public_metrics['retweet_count'],
                    "langue": tweet.lang
                })
            
            pages_fetched += 1
            print(f"Page {pages_fetched} récupérée, total tweets: {len(tweets_data)}")
            
            next_token = response.meta.get('next_token', None)
            if not next_token:
                print("Fin de la pagination.")
                break
            
            time.sleep(1)
        
        except tweepy.TooManyRequests:
            print(f"Limite API atteinte pour token {token_index + 1}, changement de token...")
            token_index += 1
            if token_index >= len(tokens):
                print("Tous les tokens sont épuisés, pause de 15 minutes...")
                time.sleep(15 * 60)
                token_index = 0
            client = get_client(tokens[token_index])
            next_token = None  # Recommencer pagination avec nouveau token
        
        except Exception as e:
            print(f"Erreur inattendue : {e}")
            break
    
    return tweets_data

# Utilisation
query = "(#beauty OR #fashion OR #tech OR #food) -is:retweet lang:en"
tweets = search_tweets_with_pagination(BEARER_TOKENS, query, max_results=10, max_pages=5)

df_tweets = pd.DataFrame(tweets)
print(df_tweets)




Erreur inattendue : 401 Unauthorized
Unauthorized
Empty DataFrame
Columns: []
Index: []


In [3]:
import tweepy
from dotenv import load_dotenv
import os

load_dotenv()

BEARER_TOKEN = os.getenv("BEARER_TOKEN_1")
print("Token chargé :", BEARER_TOKEN)

client = tweepy.Client(bearer_token=BEARER_TOKEN)

try:
    tweets = client.search_recent_tweets(query="hello", max_results=5)
    for tweet in tweets.data:
        print(tweet.text)
except Exception as e:
    print("Erreur:", e)


Token chargé : None
Erreur: 401 Unauthorized
Unauthorized


In [2]:
# Analyse des hashtags (sans les doublons)
hashtags = []
for text in df_twitter["text"]:
    hashtags.extend(re.findall(r"#\w+", text.lower()))

Counter(hashtags).most_common(10)


NameError: name 'df_twitter' is not defined