# Import Libraries and Load Data

In [1]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer # Includes support for different sets of stop words and tokenizers that are better suited for languages other than English
import string
from langdetect import detect, LangDetectException # To detect different languages
# Another approach could be to use a public API to translate the content from any word to english and to conduct the whole process in english
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tqdm import tqdm

In [2]:
# Download NLTK resources
nltk.download('punkt')
# nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/yuyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/yuyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Preprocessing

In [3]:
# Languages supported by both SnowballStemmer and Word_tokenize; Otherwise it will give error
lang_map = {
    'en': 'english',        # English
    'es': 'spanish',        # Spanish
	'fr': 'french',         # French
    'de': 'german',         # German
    'da': 'danish',         # Danish
	'nl': 'dutch',          # Dutch
	'it': 'italian',        # Italian
    'fi': 'finnish',        # Finnish
    'ru': 'russian',        # Russian
	'el': 'greek',          # Greek
	'no': 'norwegian',      # Norwegian
	'pt': 'portuguese',     # Portuguese
	'sv': 'swedish'        # Swedish
}


In [4]:
def preprocess_text(text, lang_code='en', language_mapping=lang_map):
    # Map language code to SnowballStemmer language name
    language = language_mapping.get(lang_code, 'english')  # Default to 'english' if lang_code is unsupported
    
    text = text.lower()

    tokens = word_tokenize(text, language=language)
    
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped_tokens = [w.translate(table) for w in tokens]
    
    # Remove non-alphabetic tokens
    words = [word for word in stripped_tokens if word.isalpha()]

    #TODO: remove what comes after an @; it will make the content look strange after translation
    
    # Remove stop words for the detected language
    try:
        stop_words = set(stopwords.words(language))
    except OSError:
        stop_words = set()
    words = [w for w in words if not w in stop_words]
    
    # Stemming
    try:
        stemmer = SnowballStemmer(language)
        stemmed = [stemmer.stem(word) for word in words]
    except Exception as e:
        print(f"Stemming not performed due to: {e}")
        stemmed = words  # Fallback to non-stemmed words if stemming fails

    return stemmed

In [5]:
def process_tweets(json_file):
    processed_tweets = []

    with open(json_file, 'r') as file:
        total_lines = sum(1 for line in file)

    # Open the file and read it line by line
    with open(json_file, 'r') as file:
        # i = 0
        for line in tqdm(file, total=total_lines, desc="Processing Tweets", unit="tweet"):
            try:
                tweet = json.loads(line)
                
                # Attempt to detect language
                try:
                    lang = detect(tweet.get('content', ''))
                except LangDetectException:
                    lang = 'unsure'  # Set to 'unsure' if language detection fails

                # print(lang, tweet)

                lang_of_process = 'english' if lang == 'unsure' else lang # Use English as the default language if unsure

                # Process the tweet content
                content = tweet.get('content', '')
                processed_content = preprocess_text(content, lang_code=lang_of_process)
                
                # Extract hashtags and handle them separately if needed
                hashtags = [word for word in content.split() if word.startswith('#')] #TODO: See what can we do with hashtags
                
                # Build the processed tweet information
                processed_tweet = {
                    'id': tweet.get('id', None),
                    'content': ' '.join(processed_content),
                    'date': tweet.get('date'),
                    'hashtags': ' '.join(hashtags),
                    'likes': tweet.get('likeCount'),
                    'retweets': tweet.get('retweetCount'),
                    'url': tweet.get('url'),
                    'language': lang  # Store the detected or default language
                }
                
                processed_tweets.append(processed_tweet)
                # i += 1
                # if i > 50:  # Limit to 10 tweets for demonstration
                #     break
            except json.JSONDecodeError:
                print(f"Failed to parse line: {line}")
                continue

    return processed_tweets

In [6]:
# Correct path to your JSON file
json_file_path = './data/farmers-protest-tweets.json'
processed_data = process_tweets(json_file_path)

Processing Tweets: 100%|██████████| 117405/117405 [09:57<00:00, 196.34tweet/s]


In [7]:
print(processed_data[:10])


[{'id': 1364506195453767680, 'content': 'ਆ ਚ farmersprotest', 'date': '2021-02-24T09:23:22+00:00', 'hashtags': '#FarmersProtest', 'likes': 0, 'retweets': 0, 'url': 'https://twitter.com/parmarmaninder/status/1364506195453767680', 'language': 'pa'}, {'id': 1364506167226032128, 'content': 'reallyswara rohinisgh watch full video https farmersprotest nofarmersnofood https tcofustokocxk', 'date': '2021-02-24T09:23:16+00:00', 'hashtags': '#farmersprotest #NoFarmersNoFood', 'likes': 0, 'retweets': 0, 'url': 'https://twitter.com/anmoldhaliwal/status/1364506167226032128', 'language': 'en'}, {'id': 1364506144002088963, 'content': 'kisanektamorcha farmersprotest nofarmersnofood https', 'date': '2021-02-24T09:23:10+00:00', 'hashtags': '#KisanEktaMorcha #FarmersProtest #NoFarmersNoFood', 'likes': 0, 'retweets': 0, 'url': 'https://twitter.com/KotiaPreet/status/1364506144002088963', 'language': 'en'}, {'id': 1364506120497360896, 'content': 'jai jwaan jai kissan farmersprotest modiignoringfarmersdeath 

In [13]:
# Map tweetId with documentId
df = pd.DataFrame(processed_data)
doc_mapping = pd.read_csv('./data/tweet_document_ids_map.csv') 

# Merge the processed tweets DataFrame with the document mapping on the 'id' column
df_merged = pd.merge(df, doc_mapping, on='id', how='left')
df_merged.head()

Unnamed: 0,id,content,date,hashtags,likes,retweets,url,language,docId
0,1364506195453767680,ਆ ਚ farmersprotest,2021-02-24T09:23:22+00:00,#FarmersProtest,0,0,https://twitter.com/parmarmaninder/status/1364...,pa,
1,1364506167226032128,reallyswara rohinisgh watch full video https f...,2021-02-24T09:23:16+00:00,#farmersprotest #NoFarmersNoFood,0,0,https://twitter.com/anmoldhaliwal/status/13645...,en,doc_2
2,1364506144002088963,kisanektamorcha farmersprotest nofarmersnofood...,2021-02-24T09:23:10+00:00,#KisanEktaMorcha #FarmersProtest #NoFarmersNoFood,0,0,https://twitter.com/KotiaPreet/status/13645061...,en,
3,1364506120497360896,jai jwaan jai kissan farmersprotest modiignori...,2021-02-24T09:23:05+00:00,#FarmersProtest #ModiIgnoringFarmersDeaths,0,0,https://twitter.com/babli_708/status/136450612...,en,
4,1364506076272496640,farmersprotest,2021-02-24T09:22:54+00:00,#FarmersProtest,0,0,https://twitter.com/Varinde17354019/status/136...,en,


In [14]:
# Save the data in CSV format
df_merged.to_csv('processed_data.csv', index=False)

# Explanatory Data Analysis

In [16]:
df = pd.read_csv("./processed_data.csv")

df.head(20)
# TODO: Ask if all tweets are by default related with a document or can it be null

Unnamed: 0,id,content,date,hashtags,likes,retweets,url,language,docId
0,1364506195453767680,ਆ ਚ farmersprotest,2021-02-24T09:23:22+00:00,#FarmersProtest,0,0,https://twitter.com/parmarmaninder/status/1364...,pa,
1,1364506167226032128,reallyswara rohinisgh watch full video https f...,2021-02-24T09:23:16+00:00,#farmersprotest #NoFarmersNoFood,0,0,https://twitter.com/anmoldhaliwal/status/13645...,en,doc_2
2,1364506144002088963,kisanektamorcha farmersprotest nofarmersnofood...,2021-02-24T09:23:10+00:00,#KisanEktaMorcha #FarmersProtest #NoFarmersNoFood,0,0,https://twitter.com/KotiaPreet/status/13645061...,en,
3,1364506120497360896,jai jwaan jai kissan farmersprotest modiignori...,2021-02-24T09:23:05+00:00,#FarmersProtest #ModiIgnoringFarmersDeaths,0,0,https://twitter.com/babli_708/status/136450612...,en,
4,1364506076272496640,farmersprotest,2021-02-24T09:22:54+00:00,#FarmersProtest,0,0,https://twitter.com/Varinde17354019/status/136...,en,
5,1364505995859423234,modidontsellfarm farmersprotest https,2021-02-24T09:22:35+00:00,#ModiDontSellFarmers #FarmersProtest,0,0,https://twitter.com/BitnamSingh/status/1364505...,en,
6,1364505991887347714,watch full video https farmersprotest nofarmer...,2021-02-24T09:22:34+00:00,#farmersprotest #NoFarmersNoFood,0,0,https://twitter.com/anmoldhaliwal/status/13645...,en,doc_3
7,1364505896576053248,farmersprot https,2021-02-24T09:22:11+00:00,#FarmersProtest,0,0,https://twitter.com/SatThiara/status/136450589...,de,
8,1364505892612268032,wheatfield farmersprotest https tcoyxsvdprpgg,2021-02-24T09:22:10+00:00,#WheatField #FarmersProtest,1,1,https://twitter.com/PasumaiVikatan/status/1364...,ta,
9,1364505813834989568,watch full video https farmersprotest nofarmer...,2021-02-24T09:21:51+00:00,#farmersprotest #NoFarmersNoFood,0,0,https://twitter.com/anmoldhaliwal/status/13645...,en,doc_4
