In [1]:
import pandas as pd

twitter_df = pd.read_csv('tweets_data4.csv', encoding='utf-8')

display(twitter_df)

Unnamed: 0,Keyword,Tweet Text
0,buurthub OR deelvervoer OR deelauto OR deelfie...,@PekePeter @Nieuwsblad_be @groen Dan is het we...
1,buurthub OR deelvervoer OR deelauto OR deelfie...,Wéér trekt een aanbieder van deelvervoer zich ...
2,buurthub OR deelvervoer OR deelauto OR deelfie...,#Tinyhouses #Westpark #Groningen Er gebeurt hi...
3,buurthub OR deelvervoer OR deelauto OR deelfie...,Autodelen of een deelauto. Ideaal te combinere...
4,buurthub OR deelvervoer OR deelauto OR deelfie...,Vandaag is tevens de start van de landelijke c...
...,...,...
1035,buurthub OR deelvervoer OR deelauto OR deelfie...,@TNYBN123 @z0roProfit Een deelauto wordt door ...
1036,buurthub OR deelvervoer OR deelauto OR deelfie...,Deelfiets in brand bij bushalte in Gorinchem: ...
1037,buurthub OR deelvervoer OR deelauto OR deelfie...,Deelfiets in brand bij bushalte in Gorinchem: ...
1038,buurthub OR deelvervoer OR deelauto OR deelfie...,Nieuws: Deelfiets in brand bij bushalte in Gor...


In [2]:
from deep_translator import GoogleTranslator

twitter_df['Translated Text'] = twitter_df['Tweet Text'].apply(
    lambda x: GoogleTranslator(source='auto', target='en').translate(x)
)

In [3]:
display(twitter_df['Translated Text'])

0       @PekePeter @Nieuwsblad_be @groen Then it is st...
1       Once again a shared transport provider is with...
2       #Tinyhouses #Westpark #Groningen There is much...
3       Car sharing or a shared car. Ideal to combine ...
4       Today is also the start of the national campai...
                              ...                        
1035    @TNYBN123 @z0roProfit A shared car is used by ...
1036    Shared bicycle on fire at bus stop in Gorinche...
1037    Shared bicycle on fire at bus stop in Gorinche...
1038    News: Bicycle sharing on fire at bus stop in G...
1039    This morning I parked my shared bicycle powere...
Name: Translated Text, Length: 1040, dtype: object

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
from gensim import corpora
from gensim.models.ldamodel import LdaModel

english_words = set(words.words())

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # YOUR CODE HERE
    text = str(text) if isinstance(text, (str, float)) else ''

    # Remove Twitter handles
    text = re.sub(r'@[\w_]+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    text = text.translate(str.maketrans('', '', string.punctuation))

    #remove special characters
    text= re.sub(r'[^A-Za-z\s]', '', text)

    #case normalisation
    text = text.lower()

    #tokenisation
    tokens = word_tokenize(text)

    #removal of stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token in english_words]
        
    #lemmatisation
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens


texts = twitter_df['Translated Text'].tolist()

processed_texts = [preprocess_text(text) for text in texts]

corpus = corpora.Dictionary(processed_texts)

doc_term_matrix = [corpus.doc2bow(text) for text in processed_texts]

print(processed_texts)




In [5]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

def analyze_topics(texts):
    # Concatenate the lists of tweets into a single string for each document
    concatenated_texts = [' '.join(tweet_list) for tweet_list in texts]

    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

    # Create BERTopic model
    model = BERTopic(verbose=True, embedding_model='paraphrase-MiniLM-L3-v2', min_topic_size=5, ctfidf_model=ctfidf_model)

    # Fit and transform the model on the concatenated texts
    topics, _ = model.fit_transform(concatenated_texts)

    # Get topic frequency information
    freq = model.get_topic_info()

    
    print("Number of topics: {}".format(len(freq)))
    display(freq.head(50))

    freq = model.get_topic_info()
    freq.to_csv('output2.csv', index=False)

    return model, topics

# Assuming processed_texts is a list of lists containing content from different tweets
model, topics = analyze_topics(processed_texts)


Batches:   0%|          | 0/33 [00:00<?, ?it/s]

2023-12-11 18:31:11,656 - BERTopic - Transformed documents to Embeddings
2023-12-11 18:31:22,788 - BERTopic - Reduced dimensionality
2023-12-11 18:31:22,865 - BERTopic - Clustered reduced embeddings


Number of topics: 38


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,309,-1_would_dont_take_drive,"[would, dont, take, drive, still, also, use, w...","[buy car car le, better buy car car interestin..."
1,0,84,0_scooter_bench_hit_delft,"[scooter, bench, hit, delft, sidewalk, blind, ...","[scooter go district delft, back scooter, scoo..."
2,1,75,1_public_transport_become_get,"[public, transport, become, get, future, west,...",[affordable reliable public transport electric...
3,2,52,2_bicycle_bike_fire_water,"[bicycle, bike, fire, water, velo, injured, br...","[bike, bike, fire brigade bicycle water]"
4,3,44,3_like_variant_sawing_preferably,"[like, variant, sawing, preferably, taxi, stil...","[car, car, car]"
5,4,43,4_hub_neighborhood_district_consultation,"[hub, neighborhood, district, consultation, en...",[consultation supervision enforcement another ...
6,5,38,5_parking_space_permit_fixed,"[parking, space, permit, fixed, moment, privat...",[plenty available several excellent alternativ...
7,6,28,6_premium_electric_interest_becoming,"[premium, electric, interest, becoming, expect...",[already possible rent electric may also possi...
8,7,27,7_rental_rent_ownership_lease,"[rental, rent, ownership, lease, pay, even, us...","[car end car ownership, thats right combinatio..."
9,8,26,8_electric_opt_already_invest,"[electric, opt, already, invest, might, increa...","[without car opt car already electric, electri..."


In [7]:
topics_df = pd.read_csv('output2.csv', sep=',')
display(topics_df)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,309,-1_would_dont_take_drive,"['would', 'dont', 'take', 'drive', 'still', 'a...","['buy car car le', 'better buy car car interes..."
1,0,84,0_scooter_bench_hit_delft,"['scooter', 'bench', 'hit', 'delft', 'sidewalk...","['scooter go district delft', 'back scooter', ..."
2,1,75,1_public_transport_become_get,"['public', 'transport', 'become', 'get', 'futu...",['affordable reliable public transport electri...
3,2,52,2_bicycle_bike_fire_water,"['bicycle', 'bike', 'fire', 'water', 'velo', '...","['bike', 'bike', 'fire brigade bicycle water']"
4,3,44,3_like_variant_sawing_preferably,"['like', 'variant', 'sawing', 'preferably', 't...","['car', 'car', 'car']"
5,4,43,4_hub_neighborhood_district_consultation,"['hub', 'neighborhood', 'district', 'consultat...",['consultation supervision enforcement another...
6,5,38,5_parking_space_permit_fixed,"['parking', 'space', 'permit', 'fixed', 'momen...",['plenty available several excellent alternati...
7,6,28,6_premium_electric_interest_becoming,"['premium', 'electric', 'interest', 'becoming'...",['already possible rent electric may also poss...
8,7,27,7_rental_rent_ownership_lease,"['rental', 'rent', 'ownership', 'lease', 'pay'...","['car end car ownership', 'thats right combina..."
9,8,26,8_electric_opt_already_invest,"['electric', 'opt', 'already', 'invest', 'migh...","['without car opt car already electric', 'elec..."


In [8]:
import requests

def analyze_sentiment(text):
    url = "http://text-processing.com/api/sentiment/"
    data = {'text': text, 'language': 'dutch'}

    response = requests.post(url, data=data)
    result = response.json()

    sentiment = result['label']
    confidence = result['probability'][sentiment]

    return sentiment, confidence


topics_df['sentiment'], topics_df['confidence'] = zip(*topics_df['Representative_Docs'].apply(analyze_sentiment))

print(topics_df[['Representative_Docs', 'sentiment', 'confidence']])

                                  Representative_Docs sentiment  confidence
0   ['buy car car le', 'better buy car car interes...   neutral        1.00
1   ['scooter go district delft', 'back scooter', ...   neutral        1.00
2   ['affordable reliable public transport electri...   neutral        1.00
3      ['bike', 'bike', 'fire brigade bicycle water']   neutral        1.00
4                               ['car', 'car', 'car']   neutral        1.00
5   ['consultation supervision enforcement another...   neutral        1.00
6   ['plenty available several excellent alternati...   neutral        1.00
7   ['already possible rent electric may also poss...   neutral        1.00
8   ['car end car ownership', 'thats right combina...   neutral        1.00
9   ['without car opt car already electric', 'elec...   neutral        1.00
10  ['know transfer hub current learn pay transfer...   neutral        1.00
11  ['time follow mobility people make car researc...       pos        0.70
12  ['intere