In [1]:
import os
import shutil
import tarfile
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
clim = pd.read_csv('Climate_twitter.csv')
bt = pd.read_csv('twitter_sentiment_data.csv')
json = pd.read_json('train.jsonl', lines=True)

In [3]:
# adding label based on polarity score
conditions = [
    (clim['polarity'] < 0),
    (clim['polarity'] > 0),
    (clim['polarity'] == 0)]

values = [0,1, 'neutral']

clim['label'] = np.select(conditions, values)

clim.head()

Unnamed: 0,id,date,retweets,source,author,likes,text,twitter_name,location,verified,followers,friends,polarity,subjectivity,label
0,2184934963,2020-12-22 23:22:20,71,Twitter Web App,GO GREEN,91,The death of summer Arctic ice our Earth coole...,ECOWARRIORSS,,False,23415,20439,-0.054365,0.426984,0
1,508658626,2020-12-10 14:30:00,14,Twitter for Advertisers,Elsevier Energy,98,Elsevier and the EditorsinChief are pleased to...,ElsevierEnergy,"Oxford, England",False,6615,508,0.3875,0.633333,1
2,2607105006,2020-12-22 21:28:52,0,Twitter Web App,Arwyn Thomas,1,From better climate change education to improv...,siwarr5,Carmarthen,False,22,133,0.261905,0.345238,1
3,19609660,2020-12-22 21:24:10,0,Twitter Web App,"Tom Gillispie, EDITOR/WRITER",0,climate change Links to FIXING CLIMATE CHANGE ...,EDITORatWORK,"Rural Hall, North Carolina, USA",False,4191,3708,0.0,0.0,neutral
4,19609660,2020-12-21 22:52:09,1,Twitter Web App,"Tom Gillispie, EDITOR/WRITER",1,climate change The 11TH HOUR FOR THE EARTH cli...,EDITORatWORK,"Rural Hall, North Carolina, USA",False,4191,3708,0.0,0.0,neutral


In [4]:
# dropping neutral entries
clim = clim.drop(clim[clim['label'] == 'neutral'].index)

In [5]:
# dropping all columns except text and label
clim = clim.drop(columns = ['id','date','source','author','twitter_name','location','verified','retweets','likes','followers','friends','polarity','subjectivity'])

In [6]:
clim.info()
clim['text'] = clim['text'].astype('string')
clim['label'] = clim['label'].astype('int64')
clim.info()

<class 'pandas.core.frame.DataFrame'>
Index: 270 entries, 0 to 395
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    270 non-null    object
 1   label   270 non-null    object
dtypes: object(2)
memory usage: 6.3+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 270 entries, 0 to 395
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    270 non-null    string
 1   label   270 non-null    int64 
dtypes: int64(1), string(1)
memory usage: 6.3 KB


In [7]:
bt.head()
bt = bt.drop(columns=['tweetid'])

In [8]:
bt.rename(columns={'sentiment': "label", 'message':'text'}, inplace=True)

In [9]:
bt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43943 entries, 0 to 43942
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   43943 non-null  int64 
 1   text    43943 non-null  object
dtypes: int64(1), object(1)
memory usage: 686.7+ KB


In [10]:
bt['label'].value_counts() # imbalance

label
 1    22962
 2     9276
 0     7715
-1     3990
Name: count, dtype: int64

In [11]:
bt = bt.drop(bt[bt['label'] ==  2].index)

In [12]:
bt = bt.drop(bt[bt['label'] == 0].index)

In [13]:
bt.dropna(subset=['label'], inplace=True)

In [14]:
conditions = [
    (bt['label'] == -1),
    (bt['label'] == 1)]

values = [0,1]

bt['sent'] = np.select(conditions, values)

In [15]:
bt['label'] = bt['sent']
bt=bt.drop(columns=['sent'])

In [16]:
bt['text'] = bt['text'].astype("string")
bt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26952 entries, 0 to 43942
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   26952 non-null  int64 
 1   text    26952 non-null  string
dtypes: int64(1), string(1)
memory usage: 631.7 KB


In [17]:
bt['label'].value_counts()

label
1    22962
0     3990
Name: count, dtype: int64

In [18]:
json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2117 entries, 0 to 2116
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2117 non-null   object
 1   label   2117 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 33.2+ KB


In [19]:
json['label'].value_counts()

label
0    1585
1     532
Name: count, dtype: int64

In [20]:
df_list = [clim, bt, json]
df = pd.concat(df_list)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29339 entries, 0 to 2116
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    29339 non-null  object
 1   label   29339 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 687.6+ KB


In [21]:
df['text'] = df['text'].astype('string')

In [22]:
df['label'].value_counts()

label
1    23684
0     5655
Name: count, dtype: int64

#### Dataset Preprocessing: Filteration

In [23]:
import nltk

# Download the lexicon
nltk.download("vader_lexicon")

# Import the lexicon
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create an instance of SentimentIntensityAnalyzer
sent_analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/tessanderson/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [24]:
nltk.download('punkt')  # Download the punkt tokenizer if not already downloaded

df['tokens'] = df['text'].apply(nltk.word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tessanderson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
keywords = ["sustainability", "environmental", "conservation", "recycling", "sustainable",
                         "responsibility", "green", "eco-friendly", "renewable", "carbon", "climate", "ecosystem",
                         "planet", "biodiversity", "energy", "water", "pollution", "reduction", "renewability",
                         "ecological", "renewable", "greenhouse", "clean", "solar", "wind", "earth", "sustainable",
                         "planet", "ecology", "ocean", "forest", "organic", "earth-friendly", "bio", "ethics",
                         "conservationist", "sustain", "renew", "ethical", "greenery", "saver", "sustainable",
                         "conservator", "recycler", "biodegradable", "natural", "greenery", "environment",
                         "saver", "earth-saving", "sustainability", "green-living", "clean", "responsible",
                         "preservation", "regeneration", "ecosystem", "safeguarding"]
filtered_df = df[df['tokens'].apply(lambda tokens: any(keyword in tokens for keyword in keywords))]

In [26]:
filtered_df['label'].value_counts()

label
1    19287
0     2412
Name: count, dtype: int64

#### When we are using an imbalanced dataset, we can oversample the minority class using replacement. Thsi tehnique is called oversampling.

In [27]:
from sklearn.utils import resample
df1 = filtered_df.loc[filtered_df['label'] == 0]
df2 = filtered_df.loc[filtered_df['label'] == 1]
df1_sampled=resample(df1, replace=True,
                     n_samples= 19287,
                     random_state=42)
# reproducible results
filtered_df= pd.concat([df1_sampled, df2])
filtered_df.info()
filtered_df['label'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Index: 38574 entries, 20004 to 2115
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    38574 non-null  string
 1   label   38574 non-null  int64 
 2   tokens  38574 non-null  object
dtypes: int64(1), object(1), string(1)
memory usage: 1.2+ MB


label
0    19287
1    19287
Name: count, dtype: int64

#### Splitting the data: train, test, and validation

In [28]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

X = filtered_df.text.values
y = filtered_df.label.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

from sklearn.feature_extraction.text import CountVectorizer
vec=CountVectorizer()
vec.fit(X_train)
X_train=vec.transform(X_train)
X_test=vec.transform(X_test)

In [30]:
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test)

In [31]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

In [32]:
max_len = 128
#tokenize and encode the sentences
X_train_encoded = tokenizer.batch_encode_plus(filtered_df['text'].tolist(),
                                              padding = True,
                                              truncation = True,
                                              max_length = max_len,
                                              return_tensors = 'tf')

X_val_encoded = tokenizer.batch_encode_plus(X_val.tolist(),
                                            padding = True,
                                            truncation = True,
                                            max_length = max_len,
                                            return_tensors = 'tf')

X_test_encoded = tokenizer.batch_encode_plus(X_test.tolist(),
                                             padding = True,
                                             truncation = True,
                                             max_length = max_len,
                                             return_tensors = 'tf')

In [None]:
k = 0
print("Training comments -->", filtered_df.text[k])
print("\nInput IDs -->\n", X_train_encoded['input_ids'][k])
print("\nDecoded IDs -->\n", tokenizer.decode(X_train_encoded['input_ids'][k]))
print("\nAttention Mask -->\n", X_train_encoded['attention_mask'][k])
print("\nLabels -->", filtered_df.label[k])