In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import random

# Initialize Sentiment Analyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Set up Selenium WebDriver
driver = webdriver.Edge()  
driver.get("https://x.com/login")  # Open Twitter login page
time.sleep(5)  # Wait for page to load

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Surbhi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [1]:
from dotenv import load_dotenv
import os
load_dotenv()  # Load .env variables

username = os.getenv("TWITTER_USERNAME")
password = os.getenv("TWITTER_PASSWORD")

In [15]:
# Enter username
username_input = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//input[@name="text"]'))
)
username_input.send_keys(username)
username_input.send_keys(Keys.RETURN)
time.sleep(3)

# Wait and enter password
password_input = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//input[@name="password"]'))
)
password_input.send_keys(password)
password_input.send_keys(Keys.RETURN)
time.sleep(5)

In [16]:
# Helper: Extract location by visiting profile
def get_user_location(username):
    try:
        profile_url = f"https://x.com/{username}"
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[1])
        driver.get(profile_url)
        time.sleep(random.uniform(2, 4))

        # Try getting location by scanning the profile's bio section
        spans = driver.find_elements(By.XPATH, '//div[@data-testid="UserProfileHeader_Items"]/span')
        for span in spans:
            txt = span.text.strip()
            if txt and not txt.startswith("@") and not txt.startswith("Joined"):
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
                return txt

        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        return "Unknown"
    except:
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        return "Unknown"

# Setup
brands = ["Samsung", "iPhone"]
all_tweets = []
TWEETS_PER_BRAND = 100  # Reduce for testing, can increase
all_hashtags = []
all_keywords = []
visited_users = {}

for brand in brands:
    print(f"🔄 Searching for tweets about {brand}...")

    search_url = f"https://x.com/search?q={brand}&src=typed_query&f=live"
    driver.get(search_url)
    time.sleep(5)

    tweet_data = []
    scroll_attempts = 0
    max_scrolls = 30

    while len(tweet_data) < TWEETS_PER_BRAND and scroll_attempts < max_scrolls:
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(random.uniform(2, 4))

        tweets = driver.find_elements(By.XPATH, '//article[@data-testid="tweet"]')

        for tweet in tweets:
            if len(tweet_data) >= TWEETS_PER_BRAND:
                break

            tweet_info = {"Brand": brand}

            try:
                tweet_text = tweet.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
                tweet_info["Tweet"] = tweet_text
            except:
                tweet_info["Tweet"] = "Not Found"

            tweet_info["Sentiment Score"] = sia.polarity_scores(tweet_info["Tweet"])["compound"]
            tweet_info["Sentiment"] = (
                "Positive" if tweet_info["Sentiment Score"] > 0
                else "Negative" if tweet_info["Sentiment Score"] < 0
                else "Neutral"
            )

            try:
                likes = tweet.find_element(By.XPATH, './/button[@data-testid="like"]//span').text
                tweet_info["Likes"] = likes.replace("K", "000").replace("M", "000000")
            except:
                tweet_info["Likes"] = "0"

            try:
                retweets = tweet.find_element(By.XPATH, './/button[@data-testid="retweet"]//span').text
                tweet_info["Retweets"] = retweets.replace("K", "000").replace("M", "000000")
            except:
                tweet_info["Retweets"] = "0"

            try:
                tweet_info["Time"] = tweet.find_element(By.XPATH, './/time').get_attribute("datetime")
            except:
                tweet_info["Time"] = "Not Available"

            hashtags = [word for word in tweet_info["Tweet"].split() if word.startswith("#")]
            mentions = [word for word in tweet_info["Tweet"].split() if word.startswith("@")]
            tweet_info["Hashtags"] = ", ".join(hashtags)
            tweet_info["Mentions"] = ", ".join(mentions)
            all_hashtags.extend(hashtags)
            all_keywords.extend(tweet_info["Tweet"].split())

            try:
                handle_elem = tweet.find_element(By.XPATH, './/a[contains(@href, "/status/")]')
                handle_url = handle_elem.get_attribute("href")
                username = handle_url.split("/")[3]  # https://x.com/{username}/status/...
                tweet_info["Username"] = username
            except:
                tweet_info["Username"] = "Unknown"
                username = "Unknown"

            try:
                img = tweet.find_element(By.XPATH, './/img[contains(@alt, "Image")]')
                tweet_info["Profile Image"] = img.get_attribute("src")
            except:
                tweet_info["Profile Image"] = "Not Available"

            try:
                media = tweet.find_element(By.XPATH, './/img[contains(@src, "twimg")]')
                tweet_info["Media"] = media.get_attribute("src")
            except:
                tweet_info["Media"] = "None"

            try:
                device_element = tweet.find_elements(By.XPATH, './/span[contains(@class, "css-901oao")]')
                for el in device_element:
                    text = el.text
                    if "Twitter for" in text:
                        tweet_info["Device"] = text
                        break
                else:
                    tweet_info["Device"] = "Unknown"
            except:
                tweet_info["Device"] = "Unknown"

            # Get location from profile only if username is known
            if username != "Unknown":
                if username in visited_users:
                    tweet_info["Location"] = visited_users[username]
                else:
                    loc = get_user_location(username)
                    visited_users[username] = loc
                    tweet_info["Location"] = loc
            else:
                tweet_info["Location"] = "Unknown"

            tweet_data.append(tweet_info)

        print(f"📝 {len(tweet_data)} tweets loaded for {brand}...")
        scroll_attempts += 1

    all_tweets.extend(tweet_data)

# Save data
df = pd.DataFrame(all_tweets)
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
df['Date'] = df['Time'].dt.date

🔄 Searching for tweets about Samsung...
📝 19 tweets loaded for Samsung...
📝 32 tweets loaded for Samsung...
📝 49 tweets loaded for Samsung...
📝 59 tweets loaded for Samsung...
📝 77 tweets loaded for Samsung...
📝 90 tweets loaded for Samsung...
📝 100 tweets loaded for Samsung...
🔄 Searching for tweets about iPhone...
📝 13 tweets loaded for iPhone...
📝 27 tweets loaded for iPhone...
📝 36 tweets loaded for iPhone...
📝 47 tweets loaded for iPhone...
📝 55 tweets loaded for iPhone...
📝 67 tweets loaded for iPhone...
📝 79 tweets loaded for iPhone...
📝 89 tweets loaded for iPhone...
📝 100 tweets loaded for iPhone...


In [17]:
df

Unnamed: 0,Brand,Tweet,Sentiment Score,Sentiment,Likes,Retweets,Time,Hashtags,Mentions,Username,Profile Image,Media,Device,Location,Date
0,Samsung,Je m'en fous des iPhones j'ai toujours eu des ...,0.0000,Neutral,,,2025-04-09 11:07:54+00:00,,,Oscar68686,Not Available,https://pbs.twimg.com/profile_images/190779772...,Unknown,"Hérault, Languedoc-Roussillon",2025-04-09
1,Samsung,"Mlm ini juga aku jabanin,",-0.3400,Negative,,,2025-04-09 11:07:53+00:00,,,RcSamsung1,Not Available,https://pbs.twimg.com/profile_images/190629084...,Unknown,Unknown,2025-04-09
2,Samsung,진만누나 믿어요,0.0000,Neutral,,,2025-04-09 11:07:52+00:00,,,PuDdInG5_,Not Available,https://pbs.twimg.com/profile_images/190683589...,Unknown,21,2025-04-09
3,Samsung,Samsungのリングも見てみたら5〜15号\n誰のどの指に付けること前提なんだろう？大人が...,0.0000,Neutral,,,2025-04-09 11:07:48+00:00,,,kenji_yogi,Not Available,https://pbs.twimg.com/profile_images/166714932...,Unknown,St john's co-cathedral,2025-04-09
4,Samsung,"Aku pengguna samsung dari tahun 2018, hp perta...",0.0000,Neutral,,,2025-04-09 11:07:41+00:00,,,calonkamumas,Not Available,https://pbs.twimg.com/profile_images/176090691...,Unknown,di rumah,2025-04-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,iPhone,iPhone dreigt onbetaalbaar te worden door Trum...,0.0000,Neutral,,,2025-04-09 11:11:30+00:00,#telegraafpremium,@Telegraaf,johan1970nl,Not Available,https://pbs.twimg.com/profile_images/650051636...,Unknown,"Breda, Nederland",2025-04-09
196,iPhone,アメリカ人、もうiPhoneかえんようになるな,0.0000,Neutral,,,2025-04-09 11:11:27+00:00,,,f1at,Not Available,https://pbs.twimg.com/profile_images/755025300...,Unknown,Unknown,2025-04-09
197,iPhone,Oye que cámara tan horrible del iPhone 15 pro ...,-0.5423,Negative,,,2025-04-09 11:11:27+00:00,,,Nellteg1,Not Available,https://pbs.twimg.com/profile_images/185375689...,Unknown,Unknown,2025-04-09
198,iPhone,今のうちにiPhone買っとかないと国内も中古品暴騰するんじゃないの\n転売ヤー歓喜してそう,0.0000,Neutral,,,2025-04-09 11:11:26+00:00,,,kemo3asada51345,Not Available,https://abs.twimg.com/sticky/default_profile_i...,Unknown,Unknown,2025-04-09


In [24]:
driver.quit()

In [26]:
# **Save to CSV**
df.to_csv("multi_brand_info.csv", index=False)
print(f"✅ Scraping Complete! {len(df)} tweets saved to multi_brand_tweets.csv")

✅ Scraping Complete! 200 tweets saved to multi_brand_tweets.csv


In [1]:
import pandas as pd
from langdetect import detect, LangDetectException

# Load scraped data
df = pd.read_csv("multi_brand_info.csv")

# Remove empty tweets
df = df[df["Tweet"] != "Not Found"]

# Convert numeric columns
df["Likes"] = pd.to_numeric(df["Likes"], errors="coerce").fillna(0).astype(int)
df["Retweets"] = pd.to_numeric(df["Retweets"], errors="coerce").fillna(0).astype(int)

# Safe language detection
def safe_detect(text):
    try:
        return detect(text) if text and text != "Not Found" else "unknown"
    except LangDetectException:
        return "unknown"

df["Language"] = df["Tweet"].apply(safe_detect)

# Categorize sentiment (only for English, else Neutral)
def categorize_sentiment(score, lang):
    if lang != "en":
        return "Neutral"
    if score > 0:
        return "Positive"
    elif score < 0:
        return "Negative"
    else:
        return "Neutral"

df["Sentiment"] = df.apply(lambda row: categorize_sentiment(row["Sentiment Score"], row["Language"]), axis=1)

# Add engagement metric
df["Engagement"] = df["Likes"] + df["Retweets"]

# Drop unnecessary columns
df = df.drop(columns=['Device'])

# Save cleaned data
df.to_csv("cleaned_multi_brand_info.csv", index=False)
print("✅ Data cleaned & saved as cleaned_multi_brand_info.csv (All 200 tweets kept)")

✅ Data cleaned & saved as cleaned_multi_brand_info.csv (All 200 tweets kept)


In [2]:
import pandas as pd

df = pd.read_csv("cleaned_multi_brand_info.csv")

# Clean Brand column
df['Brand'] = df['Brand'].str.strip().str.title()  # removes whitespace and capitalizes

# Optional: filter to only Samsung and iPhone if there are other brands
df = df[df['Brand'].isin(['Samsung', 'Iphone'])]

df.to_csv("cleaned_multi_brand_info.csv", index=False)
print("✅ Brand column cleaned & saved.")


✅ Brand column cleaned & saved.


In [3]:
import pandas as pd
import re
from collections import Counter

# Load your cleaned data
df = pd.read_csv("cleaned_multi_brand_info.csv")

# Define stopwords
stopwords = set([
    "the", "and", "for", "with", "that", "this", "you", "your", "are", "have",
    "has", "they", "their", "from", "what", "get", "out", "now", "its", "it's",
    "how", "more", "just", "was", "our", "about", "all"
])

# Create word frequency list with brand
rows = []

for _, row in df.iterrows():
    brand = row["Brand"]
    tweet = str(row["Tweet"]).lower()
    tweet = re.sub(r"http\S+|@\S+|#\S+|[^a-zA-Z\s]", "", tweet)  # Remove links, tags, etc.
    words = tweet.split()
    filtered = [w for w in words if len(w) > 2 and w not in stopwords]
    for word in filtered:
        rows.append((brand, word))

word_df = pd.DataFrame(rows, columns=["Brand", "Word"])
word_freq = word_df.groupby(["Brand", "Word"]).size().reset_index(name="Frequency")

# Save it
word_freq.to_excel("tweet_word_freq_by_brand.xlsx", index=False)
print("✅ Word frequency by brand saved!")


✅ Word frequency by brand saved!
