In [None]:
#goals
#analyze thread attributes & engagement
#data: https://www.kaggle.com/datasets/danielgrijalvas/twitter-threads 

In [3]:
import pandas as pd

# Summarize engagement (retweets, likes, replies) per thread_number; 'timestamp' is parsed but not used for grouping
def examine_engagement(dataframe):
    """Summarise engagement per thread.

    Parses ``dataframe['timestamp']`` with ``pd.to_datetime`` and writes the
    result back into the frame that was passed in (an in-place side effect),
    then computes the mean and median of ``retweets``, ``likes`` and
    ``replies`` for every distinct ``thread_number``.

    Only ``thread_number`` is used as the grouping key; the parsed timestamp
    does not take part in the aggregation.

    Returns a DataFrame in which ``thread_number`` is a regular column and the
    aggregates sit under two-level ``(metric, statistic)`` column labels.
    """
    # Side effect: the caller's frame ends up with a datetime 'timestamp' column.
    parsed_timestamps = pd.to_datetime(dataframe['timestamp'])
    dataframe['timestamp'] = parsed_timestamps

    # The same two statistics are requested for each engagement metric.
    aggregations = {
        metric: ['mean', 'median']
        for metric in ('retweets', 'likes', 'replies')
    }

    per_thread = dataframe.groupby(['thread_number']).agg(aggregations)
    return per_thread.reset_index()

# Folder holding the CSV files of the dataset. Forward slashes are used
# throughout so that no character of a path is read as a string escape.
DATA_DIR = "C:/Users/Jackd/Downloads/archive (1)"

# For each file: build its path, load it, and summarise engagement per thread.
# Every read_csv call uses the path variable defined on the line just above it,
# so each summary is computed from its own file.
thread2025 = f"{DATA_DIR}/twenty_twentyfive.csv"
df2025 = pd.read_csv(thread2025, encoding='ISO-8859-1')
engagement_summary_2025 = examine_engagement(df2025)

thread2530 = f"{DATA_DIR}/twentyfive_thirty.csv"
df2530 = pd.read_csv(thread2530, encoding='ISO-8859-1')
engagement_summary_2530 = examine_engagement(df2530)

thread1520 = f"{DATA_DIR}/fifteen_twenty.csv"
df1520 = pd.read_csv(thread1520, encoding='ISO-8859-1')
engagement_summary_1520 = examine_engagement(df1520)

thread0510 = f"{DATA_DIR}/five_ten.csv"
df0510 = pd.read_csv(thread0510, encoding='ISO-8859-1')
engagement_summary_0510 = examine_engagement(df0510)

thread1015 = f"{DATA_DIR}/ten_fifteen.csv"
df1015 = pd.read_csv(thread1015, encoding='ISO-8859-1')
engagement_summary_1015 = examine_engagement(df1015)


print(engagement_summary_2025)
print(engagement_summary_2530)
print(engagement_summary_1520)
print(engagement_summary_0510)
print(engagement_summary_1015)



   thread_number     retweets                likes            replies       
                         mean  median         mean  median       mean median
0       Thread 1    11.652174    10.0    25.782609    21.0   2.608696    2.0
1      Thread 10    28.666667    21.5    20.125000    14.0   1.958333    2.0
2     Thread 100   180.285714   113.0   589.095238   396.0  21.476190   14.0
3      Thread 11    23.666667    21.0    23.761905    19.0   5.428571    3.0
4      Thread 12    15.478261     1.0    36.000000     5.0   1.521739    1.0
..           ...          ...     ...          ...     ...        ...    ...
95     Thread 95     1.250000     0.0    13.291667    11.0   1.625000    1.0
96     Thread 96   190.916667   163.5   352.958333   326.0   9.041667    3.5
97     Thread 97  2162.619048  1077.0  5495.142857  3498.0  58.571429    0.0
98     Thread 98   446.043478   269.0  1347.565217  1069.0  11.086957    0.0
99     Thread 99   119.454545    80.0   688.000000   592.5  27.227273   11.0

In [18]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Fetch the NLTK resources this notebook relies on, one download call each.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# NLTK's English stop-word list followed by extra tokens to drop from the text.
custom_stop_words = [
    *stopwords.words('english'),
    'https', 'amp', 'like', 'know', 'll', 've', 'don', 'trump',
]

def preprocess_text(text):
    """Normalise one piece of raw text before vectorisation.

    The text is lower-cased and split on whitespace. A token is kept only if
    it is purely alphabetic and is not listed in ``custom_stop_words``; every
    kept token is lemmatised with NLTK's ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or the float NaN
        that pandas uses for an empty CSV cell) is treated as empty text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces, or ``''`` when nothing
        survives or the input was not a string.
    """
    # Missing values are not strings and have no .lower(); treat them as empty.
    if not isinstance(text, str):
        return ''

    lemmatizer = WordNetLemmatizer()
    # Build a set once per call so each token check is a hash lookup rather
    # than a scan of the whole stop-word list.
    stop_words = set(custom_stop_words)
    words = [
        lemmatizer.lemmatize(word)
        for word in text.lower().split()
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(words)

def display_topics(model, feature_names, no_top_words):
    """Print and return the highest-weighted words of every topic.

    For each row of ``model.components_`` the ``no_top_words`` entries with the
    largest weights are looked up in ``feature_names`` and joined with spaces,
    strongest first. One ``Topic <index>: <words>`` line is printed per topic.

    Returns a dict mapping the topic index to its space-separated word string.
    """
    topic_descriptions = {}
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: flip it, then keep the leading entries.
        strongest = weights.argsort()[::-1][:no_top_words]
        top_words = " ".join(feature_names[term] for term in strongest)
        print("Topic {}: {}".format(topic_idx, top_words))
        topic_descriptions[topic_idx] = top_words
    return topic_descriptions

def process_thread(file_path, n_topics=5, no_top_words=10):
    """Fit an LDA topic model on one CSV file and summarise engagement per topic.

    Steps: load the CSV, clean the ``text`` column with ``preprocess_text``,
    vectorise it with TF-IDF, fit ``LatentDirichletAllocation``, label every
    row with its highest-scoring topic, then aggregate ``retweets``, ``likes``
    and ``replies`` (mean and median) per topic.

    Parameters
    ----------
    file_path : str
        CSV file to load; it is read with ISO-8859-1 encoding.
    n_topics : int, default 5
        Number of LDA components to fit.
    no_top_words : int, default 10
        How many top words to list for each topic.

    Returns
    -------
    tuple or None
        ``(grouped, topic_descriptions)`` where ``grouped`` is the per-topic
        engagement table with flattened column names and
        ``topic_descriptions`` is the dict returned by ``display_topics``.
        ``None`` is returned, after printing a message, when the file lacks
        the ``text`` column or any of the engagement columns.
    """
    print(f"Loading data from {file_path}")
    df = pd.read_csv(file_path, encoding='ISO-8859-1')

    # Check if 'text' column exists
    if 'text' not in df.columns:
        print(f"No 'text' column in {file_path}")
        return None

    # The aggregation further down needs these columns as well; report their
    # absence here, the same way as a missing 'text' column, instead of
    # failing with a KeyError inside groupby.
    engagement_columns = ['retweets', 'likes', 'replies']
    missing_columns = [col for col in engagement_columns if col not in df.columns]
    if missing_columns:
        print(f"Missing engagement column(s) {missing_columns} in {file_path}")
        return None

    # Empty cells are loaded as NaN; replace them with '' (and coerce any other
    # non-string value) so the preprocessing step always receives a string.
    df['text'] = df['text'].fillna('').astype(str).apply(preprocess_text)

    # Text vectorization with TF-IDF
    # NOTE(review): scikit-learn describes LDA in terms of word-count matrices;
    # TF-IDF weights are fed in here. Consider CountVectorizer -- TODO confirm.
    vect = TfidfVectorizer(max_df=0.90, min_df=2, stop_words='english')
    dtm = vect.fit_transform(df['text'])

    # Apply LDA for topic modeling (fixed random_state for repeatable topics)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(dtm)

    # Print and collect the top words for each topic
    topic_descriptions = display_topics(lda, vect.get_feature_names_out(), no_top_words)

    # Assign the dominant topic (largest value in the row) to each document
    df['Topic'] = lda.transform(dtm).argmax(axis=1)

    # Group by Topic and calculate engagement metrics
    grouped = df.groupby('Topic').agg({
        metric: ['mean', 'median'] for metric in engagement_columns
    }).reset_index()

    # Flatten the two-level column labels into 'metric_stat' names.
    # The key column's label is ('Topic', ''), which joins to 'Topic_':
    # str.strip() removes whitespace only, so the trailing underscore stays.
    # The name is left as-is so existing consumers of the table keep working.
    grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]

    return grouped, topic_descriptions

# Short label -> CSV file to run through the topic/engagement pipeline.
threads = {
    "2025": "C:/Users/Jackd/Downloads/archive (1)/twenty_twentyfive.csv",
    "2530": "C:/Users/Jackd/Downloads/archive (1)/twentyfive_thirty.csv",
    "1520": "C:/Users/Jackd/Downloads/archive (1)/fifteen_twenty.csv",
    "0510": "C:/Users/Jackd/Downloads/archive (1)/five_ten.csv",
    "1015": "C:/Users/Jackd/Downloads/archive (1)/ten_fifteen.csv"
}

# Process every file in turn and print its engagement table and topic words.
for thread_id, file_path in threads.items():
    result = process_thread(file_path)

    # A falsy result means process_thread produced nothing for this file.
    if not result:
        print(f"Skipped {thread_id} due to missing 'text' column or other issues.")
        continue

    engagement_summary, topics = result
    print(f"\nEngagement Summary for Thread {thread_id}:")
    print(engagement_summary)
    print("\nTopic Descriptions:")
    for idx, desc in topics.items():
        print(f"Topic {idx}: {desc}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jackd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jackd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Jackd\AppData\Roaming\nltk_data...


Loading data from C:/Users/Jackd/Downloads/archive (1)/twenty_twentyfive.csv
Topic 0: democratic candidate vote way time district gop new national thing
Topic 1: post think state thread criminal people end money want mueller
Topic 2: unroll comment anon related current drop say politico demand el
Topic 3: say work asked mean answer tell witness year fact president
Topic 4: le people social right la incumbent medium bio actually look

Engagement Summary for Thread 2025:
   Topic_  retweets_mean  retweets_median  likes_mean  likes_median  \
0       0     117.688663             24.0  309.847716          63.0   
1       1     149.932367             36.0  383.768116         102.0   
2       2     146.647558             33.0  384.762208         106.0   
3       3     123.437799             32.0  350.495215         109.0   
4       4     169.139303             30.0  400.703980          95.5   

   replies_mean  replies_median  
0      6.805415             2.0  
1      7.741546             2.0