# Import Library

In [1]:
import json
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load Data

In [2]:
# Read data
data = []
with open('/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))
 
f.close()
emotion = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_id = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv')

# Build a new dataframe by selecting useful features.

In [3]:
# Create DataFrame from JSON
df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'text': _source.apply(lambda x: x['text']),
})

# Merge emotion and data_identification
df = df.merge(emotion, on='tweet_id', how='left')  # Add emotion column
df = df.merge(data_id, on='tweet_id', how='left')  # Add identification column

# Display resulting DataFrame
# Check if all columns are included: tweet_id, text, emotion, identification
print(df.head())  

# Split into train_data and test_data
train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

# Verify splits
print(f"Train data: {len(train_data)} rows")
print(f"Test data: {len(test_data)} rows")

   tweet_id                                               text       emotion  \
0  0x376b20  People who post "add me on #Snapchat" must be ...  anticipation   
1  0x2d5350  @brianklaas As we see, Trump is dangerous to #...       sadness   
2  0x28b412  Confident of your obedience, I write to you, k...           NaN   
3  0x1cd5b0                Now ISSA is stalking Tasha 😂😂😂 <LH>          fear   
4  0x2de201  "Trust is not the same as faith. A friend is s...           NaN   

  identification  
0          train  
1          train  
2           test  
3          train  
4           test  
Train data: 1455563 rows
Test data: 411972 rows


In [4]:
train_data.head(10)

Unnamed: 0,tweet_id,text,emotion,identification
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",anticipation,train
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",sadness,train
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,fear,train
5,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,joy,train
6,0x2c91a8,Still waiting on those supplies Liscus. <LH>,anticipation,train
7,0x368e95,Love knows no gender. 😢😭 <LH>,joy,train
8,0x249c0c,@DStvNgCare @DStvNg More highlights are being ...,sadness,train
10,0x359db9,The #SSM debate; <LH> (a manufactured fantasy ...,anticipation,train
11,0x23b037,I love suffering 🙃🙃 I love when valium does no...,joy,train
12,0x1fde89,Can someone tell my why my feeds scroll back t...,anger,train


In [5]:
test_data.head(10)

Unnamed: 0,tweet_id,text,emotion,identification
2,0x28b412,"Confident of your obedience, I write to you, k...",,test
4,0x2de201,"""Trust is not the same as faith. A friend is s...",,test
9,0x218443,When do you have enough ? When are you satisfi...,,test
30,0x2939d5,"God woke you up, now chase the day #GodsPlan #...",,test
33,0x26289a,"In these tough times, who do YOU turn to as yo...",,test
35,0x31c6e0,Turns out you can recognise people by their un...,,test
37,0x32edee,"I like how Hayvens mommy, daddy, and the keybo...",,test
46,0x3714ee,I just love it when every single one of my son...,,test
49,0x235628,@JulieChen when can we expect a season of #Cel...,,test
56,0x283024,Tbh. Regret hurts more than stepping on a LEGO...,,test


### Droping tweet_id and identification, because it isn't going to be features used on training

In [6]:
y_train_data = train_data['emotion']
X_train_data = train_data.drop(['tweet_id', 'emotion', 'identification'], axis=1)

In [7]:
X_train_data

Unnamed: 0,text
0,"People who post ""add me on #Snapchat"" must be ..."
1,"@brianklaas As we see, Trump is dangerous to #..."
3,Now ISSA is stalking Tasha 😂😂😂 <LH>
5,@RISKshow @TheKevinAllison Thx for the BEST TI...
6,Still waiting on those supplies Liscus. <LH>
...,...
1867526,I'm SO HAPPY!!! #NoWonder the name of this sho...
1867527,In every circumtance I'd like to be thankful t...
1867528,there's currently two girls walking around the...
1867533,"Ah, corporate life, where you can date <LH> us..."


In [8]:
y_train_data

0          anticipation
1               sadness
3                  fear
5                   joy
6          anticipation
               ...     
1867526             joy
1867527             joy
1867528             joy
1867533             joy
1867534             joy
Name: emotion, Length: 1455563, dtype: object

In [9]:
from sklearn.model_selection import train_test_split
# Split training and testing data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_data, test_size=0.2, random_state=42, stratify=y_train_data) 

In [10]:
X_train

Unnamed: 0,text
1488945,@HarmsWayChad Yes you are so right! <LH>
824636,"When I see who He is, I know who I am.. <LH> <..."
1246682,@Interscope @LanaDelRey Love <LH> !
1158787,the cutest boy just took my order at starbucks...
1618785,God Thank you for the successful of #AidForHum...
...,...
1275457,@ArthurSchwartz @HillelSims Joy Reed is a <LH>...
1537386,Happiness will come to you when it comes from ...
1641296,@Jaspritbumrah93 @msdhoni You r <LH> MS @msdho...
533531,so awesome seeing @chefjoseandres and other re...


In [11]:
y_train

1488945        joy
824636         joy
1246682        joy
1158787        joy
1618785        joy
            ...   
1275457    disgust
1537386        joy
1641296        joy
533531       trust
361133         joy
Name: emotion, Length: 1164450, dtype: object

In [12]:
X_test

Unnamed: 0,text
302181,"@TobiasTuti #faith, #hope, and <LH> will last ..."
1134130,HBO/HBOgo Should just drop the whole season ju...
71178,✴Give us a try!✴ Looking for a particular ser...
744197,Feels like the end of September. <LH>
19011,What is it about our culture that has made sex...
...,...
902293,@PDPhillipsJa @PSimpsonMiller @DrShaneAlexis @...
883664,literally so <LH> to have Zofija in my flat lol
1494184,GAMEDAY!Bell Creek @ CCS @ 4. John 17:4 #CCC <LH>
1754951,@wikiwachee @JulianCastro Was wondering the sa...


In [13]:
y_test

302181         joy
1134130       fear
71178          joy
744197         joy
19011      sadness
            ...   
902293     disgust
883664         joy
1494184      trust
1754951      trust
911498     sadness
Name: emotion, Length: 291113, dtype: object

In [14]:
tfidf = TfidfVectorizer(max_features=1000) # Use tfidfVectorizer and remove stop_words.
X = tfidf.fit_transform(X_train['text']).toarray()
X_test = tfidf.transform(X_test['text'])

In [15]:
le = LabelEncoder() # Label target
y = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [16]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=50,         # Description: The number of trees in the forest. Default: 100
    max_depth=10,             # Limit tree depth
    random_state=42,          # Ensure reproducibility
    n_jobs=-1,                # Use all processors for faster computation
)

clf.fit(X, y)
model = clf

## predict

In [17]:
# train
y_pred = model.predict(X) # Predict
# test
y_test_pred = model.predict(X_test) # X_test has not been seen in the model before

In [18]:
## accuracy
from sklearn.metrics import accuracy_score

acc_train = accuracy_score(y_true=y, y_pred=y_pred) # since x_train has been seen in the training process
acc_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print('training accuracy: {}'.format(round(acc_train, 2))) 
print('testing accuracy: {}'.format(round(acc_test, 2)))

training accuracy: 0.39
testing accuracy: 0.39


# Deal with test data

### Convert test data into the form that same as train data.

In [19]:
test_data = df[df['identification'] == 'test'] # Get the test data from df

In [20]:
# Do the same thing as training stage, but here we don't have emotions feature.
X_test_data = test_data.drop(['tweet_id', 'identification'], axis=1)

In [21]:
X_test_data

Unnamed: 0,text,emotion
2,"Confident of your obedience, I write to you, k...",
4,"""Trust is not the same as faith. A friend is s...",
9,When do you have enough ? When are you satisfi...,
30,"God woke you up, now chase the day #GodsPlan #...",
33,"In these tough times, who do YOU turn to as yo...",
...,...,...
1867525,"""For this is the message that ye heard from th...",
1867529,"""There is a lad here, which hath five barley l...",
1867530,When you buy the last 2 tickets remaining for ...,
1867531,I swear all this hard work gone pay off one da...,


In [22]:
X_test_data = tfidf.transform(X_test_data['text']).toarray() # Convert test data by using same tfidfVectorizer

In [23]:
X_test_data

array([[0.        , 0.        , 0.        , ..., 0.17667198, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [24]:
y_test_pred = model.predict(X_test_data)

In [25]:
y_pred_labels = le.inverse_transform(y_test_pred) # Inverse predict labels back to adjective words

## Make up submit format

In [26]:
len(test_data['tweet_id'])

411972

In [27]:
y_pred_labels

array(['joy', 'joy', 'joy', ..., 'joy', 'joy', 'joy'], dtype=object)

In [28]:
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': y_pred_labels
})



In [29]:
submission.to_csv('/kaggle/working/submission.csv', index=False)