In [1]:
import pandas as pd
import re
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pyodbc


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the raw tweets. Reading "Text" with dtype=str keeps every tweet as a
# string while leaving missing values as NaN. A blanket .astype(str) would turn
# NaN into the literal text "nan", which the later dropna() cannot detect.
df1 = pd.read_csv('tweets_1.csv', dtype={"Text": str})
df1.head()

Unnamed: 0,Tweet_count,Username,Text,Created At,Retweets,Likes
0,1,England,Our fans. Our players. Our summer.\n\nThis is ...,Thu Jun 06 18:00:20 +0000 2024,4543,26633
1,2,Tommy Robinson 🇬🇧,Man dies at Yorkshire dales Hotspot at the wee...,Tue Aug 13 14:55:10 +0000 2024,1391,9769
2,3,England Extra,For the people mocking Lee Carsley being a pos...,Tue Jul 16 17:25:57 +0000 2024,433,4542
3,4,CentreGoals.,🚨🚨| Manchester United are monitoring Eberechi ...,Tue Sep 10 11:24:17 +0000 2024,80,1817
4,5,England,Ready for a big summer! 📸\n\n#ThreeLions | @ma...,Mon Jun 10 20:24:12 +0000 2024,2011,22827


In [4]:
# Parse the raw 'Created At' strings (e.g. "Thu Jun 06 18:00:20 +0000 2024")
# and re-render them as "dd-mm-YYYY HH:MM:SS" text.
parsed_created_at = pd.to_datetime(df1['Created At'], format='%a %b %d %H:%M:%S %z %Y')
df1['Created At'] = parsed_created_at.dt.strftime("%d-%m-%Y %H:%M:%S")

# Keep only the tweet text and its timestamp, under the names used downstream.
df1 = (
    df1
    .drop(columns=["Tweet_count", "Username", "Retweets", "Likes"])
    .rename(columns={'Text': 'tweet_content', 'Created At': 'tweet_date'})
)

In [9]:
# Preview the frame after the rename to tweet_content / tweet_date.
df1.head()

Unnamed: 0,tweet_content,tweet_date
0,Our fans. Our players. Our summer.\n\nThis is ...,06-06-2024 18:00:20
1,Man dies at Yorkshire dales Hotspot at the wee...,13-08-2024 14:55:10
2,For the people mocking Lee Carsley being a pos...,16-07-2024 17:25:57
3,🚨🚨| Manchester United are monitoring Eberechi ...,10-09-2024 11:24:17
4,Ready for a big summer! 📸\n\n#ThreeLions | @ma...,10-06-2024 20:24:12


In [5]:
# data cleaning and prep
def data_cleaning_and_preparation(df, text):
    """Return a cleaned copy of ``df`` with the ``text`` column normalised.

    Steps: drop rows whose text is missing, lowercase, strip everything except
    ASCII letters and whitespace, collapse whitespace runs, then drop rows whose
    cleaned text duplicates an earlier row (the first occurrence is kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is cleaned and returned.
    text : str
        Name of the column holding the text to clean.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; surviving rows keep their original index labels.
    """
    # 1. Drop rows with missing values in the text column. Working on a copy
    #    leaves the caller's frame untouched, so re-running a cell is safe.
    cleaned = df.dropna(subset=[text]).copy()

    def normalise(value):
        # 2. Convert text to lowercase (str() guards against non-string cells)
        value = str(value).lower()
        # 3. Remove special characters, punctuation, and numbers
        value = re.sub(r'[^a-zA-Z\s]', '', value)
        # 4. Collapse extra whitespace and trim the ends
        return re.sub(r'\s+', ' ', value).strip()

    cleaned[text] = cleaned[text].map(normalise)

    # 5. Remove duplicates *after* normalisation, so texts that differ only in
    #    case, punctuation or spacing are recognised as the same text.
    cleaned = cleaned.drop_duplicates(subset=[text])

    # 6. Show a sample of the cleaned data
    print(cleaned.head())
    return cleaned


In [6]:
# Clean the tweet text and rebind df1 to the cleaned frame.
df1 = data_cleaning_and_preparation(df1, text='tweet_content')


                                       tweet_content           tweet_date
0  our fans our players our summer this is your t...  06-06-2024 18:00:20
1  man dies at yorkshire dales hotspot at the wee...  13-08-2024 14:55:10
2  for the people mocking lee carsley being a pos...  16-07-2024 17:25:57
3  manchester united are monitoring eberechi eze ...  10-09-2024 11:24:17
4  ready for a big summer threelions marksandspen...  10-06-2024 20:24:12


In [7]:
def sentiment_analysis_transformers(df, text):
    """Label each row of ``df[text]`` with a sentiment and a confidence score.

    Builds a "sentiment-analysis" pipeline for the
    ``cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual`` model and
    calls it once per row.

    Note: the columns 'scores', 'Sentiment' and 'sentiment_score' are written
    onto the ``df`` object that was passed in; only the returned frame has
    'scores' removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; gains the columns listed above.
    text : str
        Name of the column to classify.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the intermediate 'scores' column.
    """
    display_labels = {'positive': 'Positive ❤️', 'negative': 'Negative 😡'}

    classifier = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual",
    )

    # Raw pipeline output per row; element 0 is read below for label and score.
    df['scores'] = df[text].apply(lambda tweet: classifier(tweet))

    # Any label that is neither 'positive' nor 'negative' is shown as neutral.
    df['Sentiment'] = df['scores'].apply(
        lambda result: display_labels.get(result[0]['label'], 'Neutral 💛')
    )
    df['sentiment_score'] = df['scores'].apply(lambda result: result[0]['score'])

    return df.drop(columns=["scores"])

In [8]:
# NOTE(review): sentiment_result is not referenced again in the visible cells.
# The call also writes 'scores', 'Sentiment' and 'sentiment_score' onto df1
# itself, and those df1 columns are what the later cells go on to read.
sentiment_result = sentiment_analysis_transformers(df1, text='tweet_content')



In [11]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk

# Fetch the NLTK 'punkt' package.
nltk.download('punkt')

# Checkpoint shared by the tokenizer and the seq2seq model used for tag generation.
TAG_MODEL_CHECKPOINT = "fabiochiu/t5-base-tag-generation"

tokenizer = AutoTokenizer.from_pretrained(TAG_MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(TAG_MODEL_CHECKPOINT)

def hashtag_generator(df, text_column):
    """Add a 'hashtags' column holding model-generated tags for each row.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. The 'hashtags' column is written onto this object.
    text_column : str
        Name of the column whose text is fed to the model.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with a 'hashtags' column of lists of str.
    """
    def generate_tags(text):
        """Return the unique tags generated for one text, in generation order."""
        if not isinstance(text, str) or not text.strip():
            return []  # nothing to tag for empty or missing text

        # Tokenize (truncating to 512 tokens) and generate a ", "-separated tag string.
        inputs = tokenizer([text], max_length=512, truncation=True, return_tensors="pt")
        output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
        decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

        # De-duplicate with dict.fromkeys rather than set(): a set of strings
        # iterates in an order that changes between interpreter sessions, which
        # made the saved tag lists differ from run to run. Blank fragments
        # (e.g. from an empty decode) are dropped instead of becoming tags.
        pieces = (piece.strip() for piece in decoded_output.strip().split(", "))
        return list(dict.fromkeys(piece for piece in pieces if piece))

    # Apply the generate_tags function to each row of the text column
    df['hashtags'] = df[text_column].apply(generate_tags)

    return df

# Generate hashtags for the 'tweet_content' column.
# hashtag_generator returns the frame it was given, so df_with_hashtags and
# df1 refer to the same object after this call.
df_with_hashtags = hashtag_generator(df1, text_column='tweet_content')




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\beaut\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                         tweet_content           tweet_date  \
0    our fans our players our summer this is your t...  06-06-2024 18:00:20   
1    man dies at yorkshire dales hotspot at the wee...  13-08-2024 14:55:10   
2    for the people mocking lee carsley being a pos...  16-07-2024 17:25:57   
3    manchester united are monitoring eberechi eze ...  10-09-2024 11:24:17   
4    ready for a big summer threelions marksandspen...  10-06-2024 20:24:12   
..                                                 ...                  ...   
352  it all wasnt very well thought through though ...  08-09-2024 20:20:17   
353  totally forgot the england women are playing i...  07-09-2024 09:31:50   
354  england eyeing their first perfect test summer...  05-09-2024 21:10:52   
355  tms podcast englands poorest day of the summer...  07-09-2024 19:45:30   
356  rugby should be a summer sport in england alre...  07-09-2024 12:05:49   

                                                sco

In [13]:
# Inspect the annotated frame while the raw 'scores' column is still present.
df_with_hashtags.head()

Unnamed: 0,tweet_content,tweet_date,scores,Sentiment,hashtags
0,our fans our players our summer this is your t...,06-06-2024 18:00:20,"[{'label': 'neutral', 'score': 0.8428390026092...",Neutral 💛,"[Football, Europe, World]"
1,man dies at yorkshire dales hotspot at the wee...,13-08-2024 14:55:10,"[{'label': 'negative', 'score': 0.910675406455...",Negative 😡,"[Death, Coronavirus, Society, Virus]"
2,for the people mocking lee carsley being a pos...,16-07-2024 17:25:57,"[{'label': 'positive', 'score': 0.707705795764...",Positive ❤️,"[Defi, Football, Football Hall Of Fame, Hall O..."
3,manchester united are monitoring eberechi eze ...,10-09-2024 11:24:17,"[{'label': 'positive', 'score': 0.601078152656...",Positive ❤️,"[Defi, World, Soccer, United, United States]"
4,ready for a big summer threelions marksandspen...,10-06-2024 20:24:12,"[{'label': 'neutral', 'score': 0.6132248044013...",Neutral 💛,"[Summer, Summer Training, Training, Campfire]"


In [14]:
# Flatten the raw pipeline output into a numeric 'sentiment_score', then drop
# the raw 'scores' column. Guarded so the cell can be re-run: after the first
# run 'scores' no longer exists and an unguarded lookup would raise KeyError.
if 'scores' in df_with_hashtags.columns:
    df_with_hashtags['sentiment_score'] = df_with_hashtags['scores'].apply(lambda output: output[0]['score'])
    df_with_hashtags = df_with_hashtags.drop(columns=["scores"])

In [15]:
# Confirm 'scores' is gone and 'sentiment_score' is present.
df_with_hashtags.head()

Unnamed: 0,tweet_content,tweet_date,Sentiment,hashtags,sentiment_score
0,our fans our players our summer this is your t...,06-06-2024 18:00:20,Neutral 💛,"[Football, Europe, World]",0.842839
1,man dies at yorkshire dales hotspot at the wee...,13-08-2024 14:55:10,Negative 😡,"[Death, Coronavirus, Society, Virus]",0.910675
2,for the people mocking lee carsley being a pos...,16-07-2024 17:25:57,Positive ❤️,"[Defi, Football, Football Hall Of Fame, Hall O...",0.707706
3,manchester united are monitoring eberechi eze ...,10-09-2024 11:24:17,Positive ❤️,"[Defi, World, Soccer, United, United States]",0.601078
4,ready for a big summer threelions marksandspen...,10-06-2024 20:24:12,Neutral 💛,"[Summer, Summer Training, Training, Campfire]",0.613225


In [19]:
# Write the final frame to result.csv without the index column.
df_with_hashtags.to_csv('result.csv', index=False)

In [23]:
# Write the final frame to result.xlsx without the index column.
df_with_hashtags.to_excel('result.xlsx' , index=False)


In [None]:
# hashtag generation
# topic classification
# data cleaning and enrichment: ok
# define the terms of the project, i.e.
# sentiment analysis,
# ETL, pipeline, real time, batch,
# diagrams: use case, class, package, sequence


# SELECT TOP (1000) [tweet_id]
#       ,[tweet_content]
#       ,[tweet_date]
#       ,[tweet_location]
#       ,[hashtag]
#       ,[sentiment_id]
#   FROM [tweets].[dbo].[tweet]



# SELECT TOP (1000) [sentiment_id]
#       ,[sentiment]
#       ,[sentiment_score]
#   FROM [tweets].[dbo].[sentiment]