In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd "/content/drive/MyDrive/IR/Final Project"

/content/drive/MyDrive/IR/Final Project


# Import Libraries and Load Data

In [5]:
import pandas as pd
import json
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from wordcloud import WordCloud

nltk.download('stopwords')

# load data
data_path = '/content/drive/MyDrive/IR/Final Project/IRWA_data_2024/data/farmers-protest-tweets.json'

# read file & parse each JSON object
tweets_data = []
with open(data_path, 'r') as file:
    for line in file:
        try:
            data = json.loads(line)
            tweets_data.append(data)
        except json.JSONDecodeError as e:
            print(f"Skipping invalid line: {e}")

# dataframe
tweets_df = pd.DataFrame(tweets_data)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Pre-Processing Function

In [6]:
def preprocess_tweet_combined(tweet):
    # init stemmer
    stemmer = PorterStemmer()

    # change to lowercase
    tweet = tweet.lower()

    # remove URLs
    tweet = ' '.join(word for word in tweet.split() if not word.startswith('http'))

    # tokenization
    tokens = tweet.split()

    # separate hashtags from '#'
    tokens = [word[1:] if word.startswith('#') else word for word in tokens]

    # remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # remove stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]

    # stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(stemmed_tokens)

# preprocessing
tweets_df['processed_content'] = tweets_df['content'].apply(preprocess_tweet_combined)


In [10]:
# Select relevant columns including the processed content
relevant_columns = ['content', 'processed_content', 'url', 'date', 'likeCount', 'retweetCount']

# Display the first 5 tweets with all relevant information as a table
print("First 5 Tweets with All Information:")
display(tweets_df[relevant_columns])  # Display as a table in Jupyter Notebook

First 5 Tweets with All Information:


Unnamed: 0,content,processed_content,url,date,likeCount,retweetCount
0,The world progresses while the Indian police a...,world progress indian polic govt still tri tak...,https://twitter.com/ArjunSinghPanam/status/136...,2021-02-24T09:23:35+00:00,0,0
1,#FarmersProtest \n#ModiIgnoringFarmersDeaths \...,farmersprotest modiignoringfarmersdeath modido...,https://twitter.com/PrdeepNain/status/13645062...,2021-02-24T09:23:32+00:00,0,0
2,ਪੈਟਰੋਲ ਦੀਆਂ ਕੀਮਤਾਂ ਨੂੰ ਮੱਦੇਨਜ਼ਰ ਰੱਖਦੇ ਹੋਏ \nਮੇ...,ਪੈਟਰੋਲ ਦੀਆਂ ਕੀਮਤਾਂ ਨੂੰ ਮੱਦੇਨਜ਼ਰ ਰੱਖਦੇ ਹੋਏ ਮੇਰੇ...,https://twitter.com/parmarmaninder/status/1364...,2021-02-24T09:23:22+00:00,0,0
3,@ReallySwara @rohini_sgh watch full video here...,@reallyswara @rohini_sgh watch full video farm...,https://twitter.com/anmoldhaliwal/status/13645...,2021-02-24T09:23:16+00:00,0,0
4,#KisanEktaMorcha #FarmersProtest #NoFarmersNoF...,kisanektamorcha farmersprotest nofarmersnofood,https://twitter.com/KotiaPreet/status/13645061...,2021-02-24T09:23:10+00:00,0,0
...,...,...,...,...,...,...
117402,#FarmersProtest #KisanAndolan #KisaanMajdoorEk...,farmersprotest kisanandolan kisaanmajdoorektaz...,https://twitter.com/rickyrickstir/status/13600...,2021-02-12T01:37:02+00:00,0,0
117403,PM मोदी की अपील के बीच संयुक्त किसान मोर्चा का...,pm मोदी की अपील के बीच संयुक्त किसान मोर्चा का...,https://twitter.com/PunjabTak/status/136004014...,2021-02-12T01:36:53+00:00,0,0
117404,United we stand.\nDivided we fall\n#Mahapancha...,unit stand. divid fall mahapanchayatrevolut fa...,https://twitter.com/ish_kayy/status/1360040134...,2021-02-12T01:36:50+00:00,39,65
117405,"सिंघु बॉर्डर पर लंबी लड़ाई की तैयारी, किसानों ...","सिंघु बॉर्डर पर लंबी लड़ाई की तैयारी, किसानों ...",https://twitter.com/TV9Bharatvarsh/status/1360...,2021-02-12T01:36:49+00:00,15,1
