**Data Scrapping using tweepy**

In [2]:
pip install tweepy




In [3]:
import tweepy

# API credentials (replace with your own)
API_KEY = "XXX"
API_SECRET = "XXX"
ACCESS_TOKEN = "XXX"
ACCESS_SECRET = "XXX"
BEARER_TOKEN = "XXX"

# Authenticate with the API
client = tweepy.Client(bearer_token=BEARER_TOKEN)

# Test the connection
try:
    user = client.get_user(username="TwitterDev")
    print(f"Authenticated! Username: {user.data.name}")
except Exception as e:
    print("Error:", e)


Error: 'NoneType' object has no attribute 'name'


In [4]:
import csv

# Keywords to search for
keywords = ['program makan gratis', 'makan gratis', 'makan bergizi gratis']
query = ' OR '.join([f'"{keyword}"' for keyword in keywords]) + ' lang:id'  # Combine keywords with OR operator

# Search for tweets
try:
    tweets = client.search_recent_tweets(query=query, max_results=100)  # You can increase max_results up to 100

    # Save results to CSV
    save_path = 'scrapped_tweets.csv'
    with open(save_path, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Tweet ID", "Text"])  # Column headers

        for tweet in tweets.data:
            writer.writerow([tweet.id, tweet.text])  # Write tweet ID and text

    print("Tweets saved to scrapped_tweets.csv!")

except Exception as e:
    print("Error:", e)


Tweets saved to scrapped_tweets.csv!


**Data Cleaning**

In [1]:
import pandas as pd

tweets = pd.read_csv('scrapped_tweets.csv', delimiter=',')
print(tweets.head())

              Tweet ID                                               Text
0  1880816057373675642  Mari Dukung dan Sukseskan Program Makan Bergiz...
1  1880816034715935156  RT @NewTread_: Jadi ini video yang buat kontro...
2  1880816031314444424  Mari Dukung dan Sukseskan Program Makan Bergiz...
3  1880816004231839969  Mari Dukung dan Sukseskan Program Makan Bergiz...
4  1880815973571399894  RT @anasurbaningrum: Ada anak komentar soal ra...


In [9]:
# Load custom stopwords from CSV
stopwords_df = pd.read_csv("stopword.csv")
custom_stopwords = stopwords_df.index.tolist() # Use index to get the stopwords

In [10]:
import nltk
import re
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')  # Download tokenizer

def clean_and_remove_stopwords(text, stopwords):
    # Remove special characters and convert to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
    text = re.sub(r"^RT[\s]+", "", text)
    text = re.sub(r'\@\w+', '', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()

    # Tokenize and remove stopwords
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stopwords]
    return " ".join(filtered_words)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [11]:
tweets["Cleaned_Tweet"] = tweets["Text"].apply(
    lambda x: clean_and_remove_stopwords(x, custom_stopwords)
)

tweets.to_csv("cleaned_scrapped_tweets.csv", index=False)

In [12]:
cleaned_tweets = pd.read_csv('cleaned_scrapped_tweets.csv', delimiter=',')
print(cleaned_tweets.head())

              Tweet ID                                               Text  \
0  1880816057373675642  Mari Dukung dan Sukseskan Program Makan Bergiz...   
1  1880816034715935156  RT @NewTread_: Jadi ini video yang buat kontro...   
2  1880816031314444424  Mari Dukung dan Sukseskan Program Makan Bergiz...   
3  1880816004231839969  Mari Dukung dan Sukseskan Program Makan Bergiz...   
4  1880815973571399894  RT @anasurbaningrum: Ada anak komentar soal ra...   

                                       Cleaned_Tweet  
0  mari dukung dan sukseskan program makan bergiz...  
1  jadi ini video yang buat kontroversi makan ber...  
2  mari dukung dan sukseskan program makan bergiz...  
3  mari dukung dan sukseskan program makan bergiz...  
4  ada anak komentar soal rasa menu makan gratis ...  
