In [45]:
import pandas as pd

# The first CSV column has no header and, in the recorded output, shows up as
# "Unnamed: 0" holding 0, 1, 2, ... -- presumably a row index that was saved
# along with the data (TODO confirm against whatever wrote the file).
# Reading it back as the index keeps it out of the data columns.
twitter_df = pd.read_csv('tweets_data4.csv', encoding='utf-8', index_col=0)

# Rich preview of the loaded tweets (pandas truncates long frames on display).
display(twitter_df)

Unnamed: 0,Keyword,Tweet Text
0,buurthub OR deelvervoer OR deelauto OR deelfie...,@PekePeter @Nieuwsblad_be @groen Dan is het we...
1,buurthub OR deelvervoer OR deelauto OR deelfie...,Wéér trekt een aanbieder van deelvervoer zich ...
2,buurthub OR deelvervoer OR deelauto OR deelfie...,#Tinyhouses #Westpark #Groningen Er gebeurt hi...
3,buurthub OR deelvervoer OR deelauto OR deelfie...,Autodelen of een deelauto. Ideaal te combinere...
4,buurthub OR deelvervoer OR deelauto OR deelfie...,Vandaag is tevens de start van de landelijke c...
...,...,...
1035,buurthub OR deelvervoer OR deelauto OR deelfie...,@TNYBN123 @z0roProfit Een deelauto wordt door ...
1036,buurthub OR deelvervoer OR deelauto OR deelfie...,Deelfiets in brand bij bushalte in Gorinchem: ...
1037,buurthub OR deelvervoer OR deelauto OR deelfie...,Deelfiets in brand bij bushalte in Gorinchem: ...
1038,buurthub OR deelvervoer OR deelauto OR deelfie...,Nieuws: Deelfiets in brand bij bushalte in Gor...


In [48]:
from deep_translator import GoogleTranslator

# One translator object is enough; the original built a fresh one per row.
translator = GoogleTranslator(source='auto', target='en')


def translate_tweet(text):
    """Translate a single tweet to English.

    Parameters
    ----------
    text : object
        Cell value from the 'Tweet Text' column.

    Returns
    -------
    object
        The English translation for non-blank strings. Any other value
        (missing values, numbers, blank strings) is returned unchanged so
        that no translation request is attempted for it.
    """
    if not isinstance(text, str) or not text.strip():
        return text
    return translator.translate(text)


tweet_text = twitter_df['Tweet Text']

# Every translation is a network round-trip, so translate each distinct tweet
# only once (the data contains repeated tweets) and map the results back.
translations = {
    text: translate_tweet(text) for text in tweet_text.dropna().unique()
}

# Missing values have no entry in `translations` and therefore stay missing.
twitter_df['Translated Text'] = tweet_text.map(translations)

KeyboardInterrupt: 

In [49]:
# Preview the English translations.
# NOTE(review): the recorded run of the translation cell ended in
# KeyboardInterrupt, yet this column is fully populated -- presumably it was
# produced by an earlier execution. Confirm with Restart Kernel -> Run All.
display(twitter_df.loc[:, 'Translated Text'])

0       @PekePeter @Nieuwsblad_be @groen Then it is st...
1       Once again a shared transport provider is with...
2       #Tinyhouses #Westpark #Groningen There is much...
3       Car sharing or a shared car. Ideal to combine ...
4       Today is also the start of the national campai...
                              ...                        
1035    @TNYBN123 @z0roProfit A shared car is used by ...
1036    Shared bicycle on fire at bus stop in Gorinche...
1037    Shared bicycle on fire at bus stop in Gorinche...
1038    News: Bicycle sharing on fire at bus stop in G...
1039    This morning I parked my shared bicycle powere...
Name: Translated Text, Length: 1040, dtype: object

In [50]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
from gensim import corpora
from gensim.models.ldamodel import LdaModel

# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

# Reference vocabulary taken from the NLTK `words` corpus, stored as a set so
# the per-token membership tests in preprocess_text are hash lookups.
# NOTE(review): requires the NLTK `words` data to be available locally; no
# download step is visible in this part of the notebook.
english_words = set(words.words())

from functools import lru_cache

# Patterns are compiled once instead of being re-parsed for every tweet.
_HANDLE_RE = re.compile(r'@\w+')               # Twitter handles such as @user_name
_URL_RE = re.compile(r'http\S+')               # http/https links
_NON_LETTER_RE = re.compile(r'[^A-Za-z\s]')    # anything except ASCII letters / whitespace


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Turn one (translated) tweet into a list of cleaned, lemmatised tokens.

    Steps, in order: strip Twitter handles and URLs, drop every character
    that is not an ASCII letter or whitespace (this also removes punctuation,
    digits and '#'), lower-case, tokenise, discard stop words and tokens that
    are not in `english_words`, then lemmatise what is left.

    Parameters
    ----------
    text : object
        Tweet text. Anything that is not a ``str`` (for example the float NaN
        pandas uses for missing cells) is treated as an empty document.

    Returns
    -------
    list of str
        The cleaned tokens; ``[]`` for non-string or empty input.
    """
    # Previously floats were passed through str(), so a missing value became
    # the literal text 'nan' and could end up in the corpus as a token.
    if not isinstance(text, str):
        return []

    text = _HANDLE_RE.sub('', text)
    text = _URL_RE.sub('', text)

    # Characters are deleted, not replaced by a space, so "e-bike" becomes
    # "ebike"; this matches the original behaviour.
    text = _NON_LETTER_RE.sub('', text)

    # Case normalisation.
    text = text.lower()

    # Tokenisation.
    tokens = word_tokenize(text)

    # Stop-word removal plus restriction to the reference vocabulary.
    stop_words = _english_stop_words()
    tokens = [
        token
        for token in tokens
        if token not in stop_words and token in english_words
    ]

    # Lemmatisation.
    # NOTE(review): this runs after the vocabulary filter, so inflected forms
    # that are absent from `english_words` are dropped before they could be
    # reduced to their lemma -- confirm this order is intended.
    return [lemmatizer.lemmatize(token) for token in tokens]


# Translated tweets as a plain Python list, one entry per row.
texts = twitter_df['Translated Text'].tolist()

# One token list per tweet.
processed_texts = [preprocess_text(text) for text in texts]

# Token <-> integer-id mapping built from all token lists.
# NOTE: despite its name, `corpus` is the gensim Dictionary, not the
# bag-of-words corpus; the name is kept because the LDA cell passes it as
# `id2word`.
corpus = corpora.Dictionary(processed_texts)

# Bag-of-words representation of every tweet: a list of (token_id, count) pairs.
doc_term_matrix = [corpus.doc2bow(text) for text in processed_texts]

# Show a size summary and a small sample instead of printing every token list,
# which floods the cell output for the full dataset.
print(f"{len(processed_texts)} documents, {len(corpus)} unique tokens")
print(processed_texts[:5])




In [55]:
# Number of latent topics the model is asked to find.
num_topics = 15

# Fit LDA on the bag-of-words documents. `corpus` is the gensim Dictionary
# built in the preprocessing cell; the fixed random_state makes the fit
# repeatable for the same input.
lda_model = LdaModel(
    corpus=doc_term_matrix,
    id2word=corpus,
    num_topics=num_topics,
    passes=10,
    random_state=42,
)

In [56]:
# Each entry is a (topic_id, "weight*word + ...") pair limited to the ten
# highest-weighted words of that topic.
topics = lda_model.print_topics(num_words=10)
for topic_summary in topics:
    print(topic_summary)

# Per gensim's documentation, log_perplexity returns the per-word likelihood
# bound for the given documents (perplexity itself is 2 ** -bound).
per_word_bound = lda_model.log_perplexity(doc_term_matrix)
print(per_word_bound)

(0, '0.038*"car" + 0.016*"transport" + 0.015*"work" + 0.012*"already" + 0.011*"neighborhood" + 0.011*"many" + 0.011*"bicycle" + 0.010*"good" + 0.009*"time" + 0.008*"parking"')
(1, '0.052*"car" + 0.020*"scooter" + 0.019*"also" + 0.013*"use" + 0.012*"every" + 0.009*"day" + 0.008*"get" + 0.007*"used" + 0.007*"longer" + 0.007*"higher"')
(2, '0.047*"car" + 0.030*"transport" + 0.017*"come" + 0.011*"share" + 0.010*"research" + 0.009*"electric" + 0.009*"sustainable" + 0.009*"bicycle" + 0.009*"public" + 0.009*"new"')
(3, '0.066*"car" + 0.028*"parking" + 0.022*"transport" + 0.017*"space" + 0.013*"city" + 0.011*"also" + 0.011*"public" + 0.010*"make" + 0.009*"drive" + 0.009*"good"')
(4, '0.024*"car" + 0.023*"electric" + 0.013*"ride" + 0.013*"drive" + 0.011*"first" + 0.010*"consultation" + 0.010*"enforcement" + 0.009*"hub" + 0.009*"neighborhood" + 0.008*"go"')
(5, '0.102*"car" + 0.031*"electric" + 0.016*"transport" + 0.012*"also" + 0.012*"via" + 0.011*"one" + 0.011*"work" + 0.010*"battery" + 0.010*