In [1]:
import json

import nltk
import pandas as pd
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [28]:
# Read the CSV file into a DataFrame
df = pd.read_csv('emotion.csv')

# Count the occurrences of each emotion class
class_counts = df['emotion'].value_counts()

# Print the per-class counts as a two-column table
print("Class\t\tCount")
print("--------------------")
for emotion_class, count in class_counts.items():
    print(f"{emotion_class}\t\t{count}")

# Take the total from the Series itself instead of accumulating into a
# variable called `sum`, which would shadow the built-in `sum()` for every
# later cell in this kernel session.
total = class_counts.sum()
print("sum: ", total)


Class		Count
--------------------
joy		516017
anticipation		248935
trust		205478
sadness		193437
disgust		139101
fear		63999
surprise		48729
anger		39867
sum:  1455563


In [29]:
# Number of labelled rows, followed by a preview of the label table.
print(len(df))
df.head()

1455563


Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation


In [30]:
# Load the raw tweets (one JSON object per line) and attach each tweet's text
# to its label row in `df`.

# The encoding is named explicitly so that tweets containing emoji or other
# non-ASCII characters are decoded the same way on every platform, instead of
# with the locale default. Blank lines are skipped because json.loads() raises
# on an empty string.
with open('tweets_DM.json', 'r', encoding='utf-8') as file:
    json_data = [json.loads(line) for line in file if line.strip()]

# Flatten the nested records into columns.
# NOTE(review): the flattened column names selected below depend on how the
# installed pandas joins nested keys with sep='_source_' -- confirm they still
# exist after a pandas upgrade.
df_json = pd.json_normalize(json_data, sep='_source_')

# Keep only the tweet id and text, under the column names used by the label table
df_json = df_json[['source_tweet_source_tweet_id', 'source_tweet_source_text']]
df_json.columns = ['tweet_id', 'text']

# Left-merge so every labelled row is kept. Dropping a pre-existing 'text'
# column first makes this cell safe to re-run: without it a second run would
# produce 'text_x' / 'text_y' instead of 'text'.
df = pd.merge(df.drop(columns='text', errors='ignore'), df_json, on='tweet_id', how='left')

In [31]:
# Sanity check after merging in the tweet text: preview rows, then row count.
print(df.head())
print(len(df))

   tweet_id       emotion                                               text
0  0x3140b1       sadness                             Why Chester? <LH> <LH>
1  0x368b73       disgust  @JaredLeto you are the fish that Jonah.  Excep...
2  0x296183  anticipation  He is coming back again and gonna come again q...
3  0x2bd6e1           joy  Dei is really such a beautiful person inside &...
4  0x2ee1dd  anticipation  Expressive praise is also an expression of fai...
1455563


In [32]:
# Read the CSV file into a DataFrame
test_train = pd.read_csv('data_identification.csv')

# Merge the two DataFrames based on the tweet_id
df = pd.merge(df, test_train, on='tweet_id', how='left')

test_df = test_train[test_train['identification']=='test']
test_df = pd.merge(test_df, df_json, on='tweet_id', how='left')

In [33]:
# Preview the labelled rows, then count how many are flagged as 'train'.
print(df.head())
df['identification'].eq('train').sum()


   tweet_id       emotion                                               text  \
0  0x3140b1       sadness                             Why Chester? <LH> <LH>   
1  0x368b73       disgust  @JaredLeto you are the fish that Jonah.  Excep...   
2  0x296183  anticipation  He is coming back again and gonna come again q...   
3  0x2bd6e1           joy  Dei is really such a beautiful person inside &...   
4  0x2ee1dd  anticipation  Expressive praise is also an expression of fai...   

  identification  
0          train  
1          train  
2          train  
3          train  
4          train  


1455563

In [34]:
# Preview the test tweets and display the frame's (rows, columns) shape.
print(test_df.head())
test_df.shape

   tweet_id identification                                               text
0  0x28cc61           test  @Habbo I've seen two separate colours of the e...
1  0x2db41f           test  @FoxNews @KellyannePolls No serious self respe...
2  0x2466f6           test  Looking for a new car, and it says 1 lady owne...
3  0x23f9e9           test  @cineworld “only the brave” just out and fount...
4  0x1fb4e1           test  Felt like total dog 💩 going into open gym and ...


(411972, 3)

In [35]:
# Show any labelled rows that are flagged as 'test'.
print(df.loc[df['identification'].eq('test')])

Empty DataFrame
Columns: [tweet_id, emotion, text, identification]
Index: []


#### Naive Bayes

In [None]:
# Baseline model: TF-IDF features fed to a Multinomial Naive Bayes classifier.
# All labelled rows in `df` are used for fitting; `test_df` (built in an
# earlier cell) holds the unlabelled tweets to predict.
train_df = df

# Vectorize the text using TF-IDF; the vocabulary is learned from the
# training text only and then applied unchanged to the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['text'])
y_train = train_df['emotion']

X_test = vectorizer.transform(test_df['text'])

# Train a Multinomial Naive Bayes classifier.
# Bound to `nb_classifier` rather than `classifier`: a later cell binds
# `classifier` to a Hugging Face pipeline and calls it, so sharing the name
# made the notebook's behaviour depend on which cell ran last.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = nb_classifier.predict(X_test)


In [None]:
# Pair every test tweet id with its Naive Bayes prediction, preview the
# result, and write it out as the TF-IDF submission file.
submission = pd.DataFrame({
    'id': test_df['tweet_id'],
    'emotion': predictions,
})
print(submission.head())

submission.to_csv('submission_tfidf.csv', index=False)
submission.shape

         id  emotion
0  0x28cc61      joy
1  0x2db41f  sadness
2  0x2466f6      joy
3  0x23f9e9      joy
4  0x1fb4e1      joy


(411972, 2)

#### Decision Tree

In [37]:
# Decision Tree model on bag-of-words counts.
# All labelled rows in `df` are used for fitting; `test_df` (built in an
# earlier cell) holds the unlabelled tweets to predict.
train_df = df

# Count features over the 700 most frequent NLTK tokens.
# token_pattern=None states explicitly that the custom tokenizer replaces the
# default regex pattern, which also avoids scikit-learn's "token_pattern will
# not be used" warning.
vectorizer = CountVectorizer(
    max_features=700,
    tokenizer=nltk.word_tokenize,
    token_pattern=None,
)
X_train = vectorizer.fit_transform(train_df['text'])
y_train = train_df['emotion']

X_test = vectorizer.transform(test_df['text'])

# Train a Decision Tree classifier (fixed seed so the fit is repeatable)
dt_classifier = DecisionTreeClassifier(random_state=30, min_samples_leaf=8, max_depth=16)
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set using Decision Tree
dt_predictions = dt_classifier.predict(X_test)

# Pair each test tweet id with its prediction and write the submission file
submission = pd.DataFrame({
    'id': test_df['tweet_id'],
    'emotion': dt_predictions,
})
print(submission.head())

submission.to_csv('submission_tree.csv', index=False)
submission.shape


         id  emotion
0  0x28cc61  sadness
1  0x2db41f      joy
2  0x2466f6      joy
3  0x23f9e9      joy
4  0x1fb4e1      joy


(411972, 2)

#### Hugging Face emotion classifier

In [13]:
# Build a Hugging Face text-classification pipeline and bind it to `classifier`.
# A later cell calls `classifier(text)`, indexes the result with [0] and takes
# the entry with the highest 'score', so it relies on the list-of-lists shape
# requested here via return_all_scores=True.
# NOTE(review): return_all_scores is reportedly deprecated in newer
# transformers releases; the replacement may nest results differently --
# verify the output shape before changing this argument.
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

  from .autonotebook import tqdm as notebook_tqdm


#### Relabel `neutral` predictions as `anticipation` or `trust`

In [20]:
# Re-label rows of an existing submission with the Hugging Face pipeline bound
# to `classifier`. A 'neutral' prediction is mapped to 'anticipation' when the
# tweet contains a verb from `anticipation_verbs`, and to 'trust' otherwise.
submission = pd.read_csv('submission.csv')

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# List of anticipation verbs
# NOTE(review): 'anticipat' and 'hop' look like stems rather than dictionary
# verbs; the list is left untouched because editing it changes predictions.
anticipation_verbs = ['anticipate','anticipat' ,'hop', 'hope', 'yearn', 'crave', 'envision', 'wish', 'aspire']

# Rows from START_INDEX up to the end of the file are (re)classified; earlier
# rows keep whatever label submission.csv already holds.
# NOTE(review): 411000 looks like a resume point from a chunked run -- set it
# to 0 to classify every row from scratch.
START_INDEX = 411000
# Progress is saved and printed whenever the row index is a multiple of this.
CHECKPOINT_EVERY = 1000


def _top_label(scores):
    """Return the 'label' of the entry with the highest 'score'."""
    return max(scores, key=lambda entry: entry['score'])['label']


def _neutral_fallback(text):
    """Return 'anticipation' if a verb in `text` is an anticipation verb, else 'trust'."""
    tagged = pos_tag(word_tokenize(text))
    verb_lemmas = [lemmatizer.lemmatize(word, pos='v') for word, tag in tagged if tag.startswith('V')]
    # The lemmatizer is deliberately applied a second time, exactly as before,
    # so that the labels produced are unchanged.
    verb_lemmas = [lemmatizer.lemmatize(word, pos='v') for word in verb_lemmas]
    if any(word in anticipation_verbs for word in verb_lemmas):
        return 'anticipation'
    return 'trust'


# Build the id -> text lookup once instead of scanning test_df on every
# iteration; keeping the first occurrence matches the previous `.values[0]`.
text_by_id = test_df.drop_duplicates('tweet_id').set_index('tweet_id')['text'].to_dict()

for index in range(START_INDEX, len(submission)):
    if index % CHECKPOINT_EVERY == 0:
        # Periodic checkpoint so an interrupted run can be resumed
        submission.to_csv('submission.csv', index=False)
        print(index)

    tweet_id = submission.at[index, 'id']
    text = text_by_id[tweet_id]

    label = _top_label(classifier(text)[0])
    if label == 'neutral':
        label = _neutral_fallback(text)

    # Write by row position rather than re-scanning the whole frame for the id
    # (assumes each id appears once in submission.csv).
    submission.at[index, 'emotion'] = label

submission.to_csv('submission.csv', index=False)
submission.shape
            

411000


(411972, 2)

#### Replace the relabelled `neutral` predictions with the TF-IDF model's predictions

In [None]:
# Wherever the current submission says 'anticipation' or 'trust', substitute
# the TF-IDF model's prediction for the same tweet, then write 'sub.csv'.
submission = pd.read_csv('submission.csv')
submission_tfidf = pd.read_csv('submission_tfidf.csv')

# Look the TF-IDF prediction up by tweet id, not by row position, so the two
# files do not have to list tweets in the same order. When they do share an
# order the result is the same as a positional lookup.
tfidf_by_id = submission_tfidf.drop_duplicates('id').set_index('id')['emotion']

# One vectorised pass replaces the row-by-row loop with its per-row scans.
needs_fallback = submission['emotion'].isin(['anticipation', 'trust'])
current_labels = submission.loc[needs_fallback, 'emotion']
fallback_labels = submission.loc[needs_fallback, 'id'].map(tfidf_by_id)

# Keep the current label for any id that has no TF-IDF prediction
submission.loc[needs_fallback, 'emotion'] = fallback_labels.fillna(current_labels)

submission.to_csv('sub.csv', index=False)
submission.shape