In [1]:
import pandas as pd
import re
import numpy as np
np.random.seed(42)
from pprint import pprint
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
# spacy for lemmatization
import spacy
import nltk
from nltk.corpus import stopwords
import contractions
import string
import openpyxl
from nltk.tokenize import word_tokenize
from gensim.models.phrases import Phrases, Phraser, ENGLISH_CONNECTOR_WORDS



In [3]:
path = r"C:\Users\Vojimir Ranitovic\Desktop\NON RELIGIOUS FINAL\all_tweets.csv"
df = pd.read_csv(path, header=None,encoding='utf-8',low_memory=False)
df.columns =['username', 'tweet', 'date', 'location',"likes","retweets","followers","tweet_url"]
df.head(2)

Unnamed: 0,username,tweet,date,location,likes,retweets,followers,tweet_url
0,AAAPORG,b'This Pakistani lecturer has spent six years ...,2019-12-06 10:29:17+00:00,"Islamabad, Pakistan",10,4,241,https://twitter.com/AAAPORG/status/12028977947...
1,AAAPORG,b'We stands with the students marching for the...,2019-11-29 07:06:57+00:00,"Islamabad, Pakistan",6,0,241,https://twitter.com/AAAPORG/status/12003101619...


In [4]:
print(df["username"].nunique())
print(df.count()) # There are this many tweets, but some of them are duplicates-retweets.etc

264
username     1224446
tweet        1224446
date         1224446
location      795230
likes        1224446
retweets     1224446
followers    1224446
tweet_url    1224446
dtype: int64


In [5]:
df=df.drop_duplicates(subset='tweet', keep='first')
print(df.nunique()) #delete duplicate tweets

username         264
tweet        1183431
date         1145913
location         183
likes           1686
retweets         626
followers        268
tweet_url    1183431
dtype: int64


In [6]:
# NLTK Stop words
stop_words = stopwords.words('english')
stop_words.extend(["amp","u","tweet","retweet","twitter","many","today","time","thing","good","humanists","amp","retweets","tweets"])

In [7]:
def clean_tweet(tweet):
    punctuation = string.punctuation + '”'
    tweet = tweet.lower()
    tweet = contractions.fix(tweet)
    tweet = re.sub(r'^b(?:"|\')?(.*)$', r'\1', tweet)
    tweet = re.sub(r"\\n", ' ', tweet)
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'#\s*\w+', '', tweet)
    tweet = re.sub(r'@\s*\w+', '', tweet)
    tweet = re.sub(r'\\[a-z0-9]{3}', '', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)  # Remove non-ASCII characters
    tweet = tweet.encode('ascii', 'ignore').decode('utf-8')  # Remove non-ASCII characters
    tweet = re.sub(r'\b\d+\b', '', tweet)  # Remove all numbers
    tweet = tweet.translate(tweet.maketrans('', '', punctuation))
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
    tweet = ' '.join([word for word in tweet.split() if len(word) > 3 or word in ["no", "not", "god", "end", "yes"]])
    return tweet


In [8]:
pd.set_option('display.max_colwidth', None) 
df.insert(df.columns.get_loc("tweet")+1, "clean_tweet", df["tweet"].map(lambda a:clean_tweet(a)))
df.head(2)

Unnamed: 0,username,tweet,clean_tweet,date,location,likes,retweets,followers,tweet_url
0,AAAPORG,b'This Pakistani lecturer has spent six years in prison and is facing the death sentence \xe2\x80\x93 for allegedly insulting the Prophet Mohammed.\n#freeJunaidHafeez',pakistani lecturer spent years prison facing death sentence allegedly insulting prophet mohammed,2019-12-06 10:29:17+00:00,"Islamabad, Pakistan",10,4,241,https://twitter.com/AAAPORG/status/1202897794702434304
1,AAAPORG,b'We stands with the students marching for their rights across Pakistan. Peaceful protest is a human right. #StudentSolidarityMarch https://t.co/TlkBlhvRVr',stands students marching rights across pakistan peaceful protest human right,2019-11-29 07:06:57+00:00,"Islamabad, Pakistan",6,0,241,https://twitter.com/AAAPORG/status/1200310161904164864


In [9]:
df.nunique()

username           264
tweet          1183431
clean_tweet    1001070
date           1145913
location           183
likes             1686
retweets           626
followers          268
tweet_url      1183431
dtype: int64

In [10]:
df=df.drop_duplicates(subset='clean_tweet', keep='first')
print(df.nunique()) #delete duplicate tweets again because there were a lot of tweets that are the same (only url links were different)

username           264
tweet          1001070
clean_tweet    1001070
date            968296
location           183
likes             1621
retweets           597
followers          268
tweet_url      1001070
dtype: int64


In [12]:
# Load Spacy Lemmatizer
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Preprocessing
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    tokens = [token for token in tokens if token not in stop_words]  # Remove stop words
    lemmas = [token.lemma_ for token in nlp(" ".join(tokens)) if token.pos_ in ['NOUN', 'ADJ']]  # Lemmatization with Spacy, allowed POS tags
    return " ".join(lemmas)

df["preprocessed_tweet"] = df["clean_tweet"].apply(preprocess_text)

# Remove duplicate rows based on "preprocessed_tweet" column
df.drop_duplicates(subset="preprocessed_tweet", inplace=True)

# Create bigrams and trigrams
corpus = df["preprocessed_tweet"].apply(str.split)
phrases = Phrases(corpus, min_count=100, threshold=150, connector_words=ENGLISH_CONNECTOR_WORDS)
bigram = Phraser(phrases)
corpus = corpus.apply(lambda x: bigram[x])
trigram = Phrases(corpus, min_count=100, threshold=150, connector_words=ENGLISH_CONNECTOR_WORDS)
trigram = Phraser(trigram)
corpus = corpus.apply(lambda x: trigram[bigram[x]])

# Convert corpus to a list of strings
corpus = corpus.apply(lambda x: ' '.join(x)).tolist()

In [13]:
# Save the DataFrame to a CSV file
df.to_csv('all_dataset_lemmatized.csv', index=False)

# Save the corpus to a text file
with open('corpus.txt', 'w') as file:
    file.write('\n'.join(corpus))
