# Project 1: Sentiment Analysis

## Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.metrics.distance import jaro_winkler_similarity
from sklearn.pipeline import Pipeline
from unidecode import unidecode
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, accuracy_score,f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils.data_utils import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
# Download WordNet if you haven't already
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

In [None]:
df = pd.read_csv('/work/Preprocessed/cleaned_data.csv')
df = df.rename(columns={'tweet': 'cleaned'})
df.head()

Unnamed: 0,sentimen,cleaned
0,negatif,kata prabowo indonesia tidak harga bangsa asin...
1,netral,batu langka tasbih jokowi hadiah dari habib lu...
2,netral,di era jokowi ekonomi indonesia makin baik ind...
3,positif,bagi sumatera selatan asi games dampak pd ekon...
4,negatif,negara kita ngutang buat bngun infrastruktur y...


### Text Tokenization

In [None]:
def tweet_tokenization(text):
    # Kode untuk melakukan tweet tokenization
    tokens = nltk.tokenize.TweetTokenizer().tokenize(text)
    return tokens

In [None]:
df['tokenization'] = df['cleaned'].apply(lambda x: tweet_tokenization(x))

In [None]:
df.head()

Unnamed: 0,sentimen,cleaned,tokenization
0,negatif,kata prabowo indonesia tidak harga bangsa asin...,"[kata, prabowo, indonesia, tidak, harga, bangs..."
1,netral,batu langka tasbih jokowi hadiah dari habib lu...,"[batu, langka, tasbih, jokowi, hadiah, dari, h..."
2,netral,di era jokowi ekonomi indonesia makin baik ind...,"[di, era, jokowi, ekonomi, indonesia, makin, b..."
3,positif,bagi sumatera selatan asi games dampak pd ekon...,"[bagi, sumatera, selatan, asi, games, dampak, ..."
4,negatif,negara kita ngutang buat bngun infrastruktur y...,"[negara, kita, ngutang, buat, bngun, infrastru..."


In [None]:
# Assuming 'df' is your DataFrame and 'sentimen' is the column you want to encode
y = df['sentimen']

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'sentimen' column
y_encoded = label_encoder.fit_transform(y)

# Replace the 'sentimen' column in the DataFrame with the encoded values
y = y_encoded

### Text Vectorization

In [None]:
result = {}
proportion = [10,20,30,40,50]
no = 1

### Count Vectorization

Unigram

In [None]:
# Initialize CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,1))
# Fit and transform the tokenized text data
X = vectorizer.fit_transform(df['cleaned'])
# Convert the sparse matrix to a DataFrame
X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

for i in proportion:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i/100, random_state=42)
    
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred,average='macro')

    result[no] = {
        "vectorization": "Count Vectorizer",
        "param": "unigram",
        "test size": i,
        "f1": round(f1, 2),
        "accuracy": round(accuracy, 2),
    }
    no+=1


Unigram-Bigram

In [None]:
# Initialize CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,2))
# Fit and transform the tokenized text data
X = vectorizer.fit_transform(df['cleaned'])
# Convert the sparse matrix to a DataFrame
X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

for i in proportion:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i/100, random_state=42)
    
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred,average='macro')

    result[no] = {
        "vectorization": "Count Vectorizer",
        "param": "unigram - bigram",
        "test size": i,
        "f1": round(f1, 2),
        "accuracy": round(accuracy, 2),
    }
    no+=1


Bigram

In [None]:
# Initialize CountVectorizer
vectorizer = CountVectorizer(ngram_range=(2,2))
# Fit and transform the tokenized text data
X = vectorizer.fit_transform(df['cleaned'])
# Convert the sparse matrix to a DataFrame
X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

for i in proportion:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i/100, random_state=42)
    
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred,average='macro')

    result[no] = {
        "vectorization": "Count Vectorizer",
        "param": "bigram",
        "test size": i,
        "f1": round(f1, 2),
        "accuracy": round(accuracy, 2),
    }
    no+=1


Trigram

In [None]:
# Initialize CountVectorizer
vectorizer = CountVectorizer(ngram_range=(3,3))
# Fit and transform the tokenized text data
X = vectorizer.fit_transform(df['cleaned'])
# Convert the sparse matrix to a DataFrame
X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

for i in proportion:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i/100, random_state=42)
    
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred,average='macro')

    result[no] = {
        "vectorization": "Count Vectorizer",
        "param": "trigram",
        "test size": i,
        "f1": round(f1, 2),
        "accuracy": round(accuracy, 2),
    }
    no+=1


### TF-IDF

Unigram

In [None]:
# menggunakan data lebih banyak
vectorizer = TfidfVectorizer(ngram_range=(1,1))
tfidf = vectorizer.fit_transform(df['cleaned'])
tfidf_array = tfidf.toarray()
X = pd.DataFrame(data=tfidf_array, columns = vectorizer.get_feature_names_out()) # untuk scikit learn baru, menggunakan get_feature_names()
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

for i in proportion:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i/100, random_state=42)
    
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred,average='macro')

    result[no] = {
        "vectorization": "TF-IDF",
        "param": "Unigram",
        "test size": i,
        "f1": round(f1, 2),
        "accuracy": round(accuracy, 2),
    }
    no+=1


Unigram - Bigram

In [None]:
# menggunakan data lebih banyak
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(df['cleaned'])
tfidf_array = tfidf.toarray()
X = pd.DataFrame(data=tfidf_array, columns = vectorizer.get_feature_names_out()) # untuk scikit learn baru, menggunakan get_feature_names()
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

for i in proportion:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i/100, random_state=42)
    
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred,average='macro')

    result[no] = {
        "vectorization": "TF-IDF",
        "param": "Unigram - Bigram",
        "test size": i,
        "f1": round(f1, 2),
        "accuracy": round(accuracy, 2),
    }
    no+=1


Bigram

In [None]:
# menggunakan data lebih banyak
vectorizer = TfidfVectorizer(ngram_range=(2,2))
tfidf = vectorizer.fit_transform(df['cleaned'])
tfidf_array = tfidf.toarray()
X = pd.DataFrame(data=tfidf_array, columns = vectorizer.get_feature_names_out()) # untuk scikit learn baru, menggunakan get_feature_names()
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

for i in proportion:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i/100, random_state=42)
    
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred,average='macro')

    result[no] = {
        "vectorization": "TF-IDF",
        "param": "Bigram",
        "test size": i,
        "f1": round(f1, 2),
        "accuracy": round(accuracy, 2),
    }
    no+=1



Trigram

In [None]:
# menggunakan data lebih banyak
vectorizer = TfidfVectorizer(ngram_range=(3,3))
tfidf = vectorizer.fit_transform(df['cleaned'])
tfidf_array = tfidf.toarray()
X = pd.DataFrame(data=tfidf_array, columns = vectorizer.get_feature_names_out()) # untuk scikit learn baru, menggunakan get_feature_names()
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

for i in proportion:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i/100, random_state=42)
    
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred,average='macro')

    result[no] = {
        "vectorization": "TF-IDF",
        "param": "Bigram",
        "test size": i,
        "f1": round(f1, 2),
        "accuracy": round(accuracy, 2),
    }
    no+=1


In [None]:
!pip install gensim==4.3.2


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from gensim.models import Word2Vec

In [None]:
X = df["tokenization"]
for i in range(1,6):
    for j in proportion:
        # Train a skip-gram Word2Vec model
        model = Word2Vec(sentences=X, vector_size=100, window=i, sg=0)

        # Function to get document embeddings
        def get_doc_embedding(doc):
            embeddings = [model.wv[word] for word in doc if word in model.wv]
            if embeddings:
                return np.mean(embeddings, axis=0)
            return np.zeros(100)  # Return zero vector if no embeddings found

        # Create document embeddings
        doc_embeddings = np.array([get_doc_embedding(doc) for doc in X])

        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(doc_embeddings, y, test_size=j/100, random_state=42)

        # Train a Random Forest classifier
        rf = RandomForestClassifier(random_state=42)
        rf.fit(X_train, y_train)

        # Make predictions on the test set
        y_pred = rf.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='macro')
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred,average='macro')

        result[no] = {
            "vectorization": "CBOW",
            "param": "window {}".format(i),
            "test size": j,
            "f1": round(f1, 2),
            "accuracy": round(accuracy, 2),
        }
        no+=1

In [None]:
X = df["tokenization"]
for i in range(1,6):
    for j in proportion:
        # Train a skip-gram Word2Vec model
        model = Word2Vec(sentences=X, vector_size=100, window=i, sg=1)

        # Function to get document embeddings
        def get_doc_embedding(doc):
            embeddings = [model.wv[word] for word in doc if word in model.wv]
            if embeddings:
                return np.mean(embeddings, axis=0)
            return np.zeros(100)  # Return zero vector if no embeddings found

        # Create document embeddings
        doc_embeddings = np.array([get_doc_embedding(doc) for doc in X])

        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(doc_embeddings, y, test_size=j/100, random_state=42)

        # Train a Random Forest classifier
        rf = RandomForestClassifier(random_state=42)
        rf.fit(X_train, y_train)

        # Make predictions on the test set
        y_pred = rf.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='macro')
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred,average='macro')

        result[no] = {
            "vectorization": "CBOW",
            "param": "window {}".format(i),
            "test size": j,
            "f1": round(f1, 2),
            "accuracy": round(accuracy, 2),
        }
        no+=1

In [None]:
result_lstm = {}

In [None]:
sentiment = pd.get_dummies(df["sentimen"])
df_baru = pd.concat([df, sentiment], axis=1)
df_baru = df_baru.drop(columns="sentimen")
df_baru.head()

Unnamed: 0,cleaned,tokenization,negatif,netral,positif
0,kata prabowo indonesia tidak harga bangsa asin...,"[kata, prabowo, indonesia, tidak, harga, bangs...",True,False,False
1,batu langka tasbih jokowi hadiah dari habib lu...,"[batu, langka, tasbih, jokowi, hadiah, dari, h...",False,True,False
2,di era jokowi ekonomi indonesia makin baik ind...,"[di, era, jokowi, ekonomi, indonesia, makin, b...",False,True,False
3,bagi sumatera selatan asi games dampak pd ekon...,"[bagi, sumatera, selatan, asi, games, dampak, ...",False,False,True
4,negara kita ngutang buat bngun infrastruktur y...,"[negara, kita, ngutang, buat, bngun, infrastru...",True,False,False


### LSTM

In [None]:
def RNN():
    inputs = Input(name='inputs', shape=(X.shape[1],))
    layer = Embedding(input_dim=1000, output_dim=50, input_length=X.shape[1])(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(3, name='out_layer')(layer)
    layer = Activation('softmax')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [None]:
no = 1
for tipe in range(0,2):
    for window in range(2,4):
        print('-'*100)
        print(vectorization)
        print("Window {}".format(window))
        print('-'*100)
        # Train a skip-gram Word2Vec model
        model = Word2Vec(sentences=df_baru["tokenization"], vector_size=100, window=2, sg=tipe)

        # Function to get document embeddings
        def get_doc_embedding(doc):
            embeddings = [model.wv[word] for word in doc if word in model.wv]
            if embeddings:
                return np.mean(embeddings, axis=0)
            return np.zeros(100)  # Return zero vector if no embeddings found

        # Create document embeddings
        doc_embeddings = np.array([get_doc_embedding(doc) for doc in X])
        y = df_baru[['negatif', 'netral', 'positif']].values
        max_length = max(len(seq) for seq in doc_embeddings)
        padded_sequences = pad_sequences(doc_embeddings, maxlen=max_length, padding='post', dtype='float32')
        # Assuming y_train is your labels
        X = np.array(padded_sequences)
        y = np.array(y)
        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model_rnn = RNN()
        model_rnn.compile(loss='categorical_crossentropy', optimizer=RMSprop(),metrics=['accuracy'])

        # Train the RNN model
        history = model_rnn.fit(X_train, y_train, batch_size=128, epochs=10, verbose=2,validation_split=0.2, 
        callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])
        loss, accuracy = model_rnn.evaluate(X_test, y_test)
        # Predict labels for test data
        y_pred = model_rnn.predict(X_test)

        # Convert predicted probabilities to class labels
        y_pred_classes = np.argmax(y_pred, axis=1)

        # Convert one-hot encoded labels to single labels
        y_true = np.argmax(y_test, axis=1)

        # Calculate F1 score
        f1 = f1_score(y_true, y_pred_classes, average='weighted')
        if tipe == 0:
            vectorization = "CBOW"
        else:
            vectorization = "Skip Gram"

        result_lstm[no] = {
            "vectorization": vectorization,
            "param": "window {}".format(window),
            "f1": round(f1, 2),
            "accuracy": round(accuracy, 2),
        }
        no+=1
        

----------------------------------------------------------------------------------------------------
Skip Gram
Window 2
----------------------------------------------------------------------------------------------------
Epoch 1/10
10/10 - 5s - loss: 1.1001 - accuracy: 0.3170 - val_loss: 1.0998 - val_accuracy: 0.2955 - 5s/epoch - 453ms/step
Epoch 2/10
10/10 - 2s - loss: 1.0990 - accuracy: 0.3187 - val_loss: 1.1016 - val_accuracy: 0.2955 - 2s/epoch - 209ms/step
----------------------------------------------------------------------------------------------------
CBOW
Window 3
----------------------------------------------------------------------------------------------------
Epoch 1/10
10/10 - 5s - loss: 1.0998 - accuracy: 0.3161 - val_loss: 1.0994 - val_accuracy: 0.3505 - 5s/epoch - 514ms/step
Epoch 2/10
10/10 - 2s - loss: 1.0997 - accuracy: 0.3213 - val_loss: 1.0983 - val_accuracy: 0.3540 - 2s/epoch - 201ms/step
---------------------------------------------------------------------------

In [None]:
types = [(CountVectorizer(ngram_range=(1,1)), "unigram", "Count Vectorizer"), 
(CountVectorizer(ngram_range=(1,2)), "unigram - Bigram", "Count Vectorizer"),
(CountVectorizer(ngram_range=(2,2)), "Bigram", "Count Vectorizer"),
(CountVectorizer(ngram_range=(3,3)), "Trigram", "Count Vectorizer"),
(TfidfVectorizer(ngram_range=(1,1)), "unigram", "TF-IDF"),
(TfidfVectorizer(ngram_range=(1,2)), "unigram - Bigram", "TF-IDF"),
(TfidfVectorizer(ngram_range=(3,3)), "Trigram", "TF-IDF")]
for tipe in types:
    print('-'*100)
    print(tipe[2])
    print("Ngram {}".format(tipe[1]))
    print('-'*100)
    vectorizer = tipe[0]
    result_vec = vectorizer.fit_transform(df['cleaned'])
    X = result_vec.toarray()
    y = df_baru[['negatif', 'netral', 'positif']].values
    max_length = max(len(np.nonzero(row)[0]) for row in X)
    padded_sequences = pad_sequences(X, maxlen=max_length, padding='post', dtype='float32')
    # Assuming y_train is your labels
    X = np.array(padded_sequences)
    y = np.array(y)
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model_rnn = RNN()
    model_rnn.compile(loss='categorical_crossentropy', optimizer=RMSprop(),metrics=['accuracy'])

    # Train the RNN model
    history = model_rnn.fit(X_train, y_train, batch_size=128, epochs=10, verbose=2,validation_split=0.2, 
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])
    loss, accuracy = model_rnn.evaluate(X_test, y_test)
    # Predict labels for test data
    y_pred = model_rnn.predict(X_test)

    # Convert predicted probabilities to class labels
    y_pred_classes = np.argmax(y_pred, axis=1)

    # Convert one-hot encoded labels to single labels
    y_true = np.argmax(y_test, axis=1)

    # Calculate F1 score
    f1 = f1_score(y_true, y_pred_classes, average='weighted')

    result_lstm[no] = {
        "vectorization": tipe[2],
        "param": "{}".format(tipe[1]),
        "f1": round(f1, 2),
        "accuracy": round(accuracy, 2),
    }
    no+=1
        

----------------------------------------------------------------------------------------------------
Count Vectorizer
Ngram unigram
----------------------------------------------------------------------------------------------------
Epoch 1/10
10/10 - 3s - loss: 1.1002 - accuracy: 0.3161 - val_loss: 1.1023 - val_accuracy: 0.2955 - 3s/epoch - 333ms/step
Epoch 2/10
10/10 - 1s - loss: 1.0998 - accuracy: 0.3376 - val_loss: 1.1009 - val_accuracy: 0.2955 - 984ms/epoch - 98ms/step
----------------------------------------------------------------------------------------------------
Count Vectorizer
Ngram unigram - Bigram
----------------------------------------------------------------------------------------------------
Epoch 1/10
10/10 - 4s - loss: 1.0997 - accuracy: 0.3316 - val_loss: 1.0990 - val_accuracy: 0.2955 - 4s/epoch - 433ms/step
Epoch 2/10
10/10 - 2s - loss: 1.0998 - accuracy: 0.3325 - val_loss: 1.1039 - val_accuracy: 0.2955 - 2s/epoch - 181ms/step
-----------------------------------

In [None]:
lstm_result_df = pd.DataFrame(result_lstm).transpose()
lstm_result_df.reset_index(drop=True,inplace=True)
lstm_result_df

Unnamed: 0,vectorization,param,f1,accuracy
0,CBOW,window 2,0.19,0.36
1,CBOW,window 3,0.16,0.33
2,Skip Gram,window 2,0.19,0.36
3,Skip Gram,window 3,0.19,0.36
4,Count Vectorizer,unigram,0.19,0.36
5,Count Vectorizer,unigram - Bigram,0.19,0.36
6,Count Vectorizer,Bigram,0.15,0.32
7,Count Vectorizer,Trigram,0.19,0.36
8,TF-IDF,unigram,0.16,0.33
9,TF-IDF,unigram - Bigram,0.16,0.33


In [None]:
rf_result_df = pd.DataFrame(result).transpose()
rf_result_df.reset_index(drop=True,inplace=True)
rf_result_df

Unnamed: 0,vectorization,param,test size,f1,accuracy
0,Count Vectorizer,unigram,10,0.61,0.61
1,Count Vectorizer,unigram,20,0.59,0.59
2,Count Vectorizer,unigram,30,0.59,0.59
3,Count Vectorizer,unigram,40,0.56,0.56
4,Count Vectorizer,unigram,50,0.54,0.55
...,...,...,...,...,...
85,CBOW,window 5,10,0.57,0.58
86,CBOW,window 5,20,0.55,0.55
87,CBOW,window 5,30,0.55,0.55
88,CBOW,window 5,40,0.55,0.55


## Hyperparameter Tuning

### Random Forest

In [0]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(df['cleaned'])
tfidf_array = tfidf.toarray()
X = pd.DataFrame(data=tfidf_array, columns = vectorizer.get_feature_names_out())
y = df["sentimen"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Create a base model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
y_pred = grid_search.predict(X_test)

f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
result_after_hyperparameter["Random Forest"] = {
    "vectorization": "TF-IDF",
    "param": "Unigram - Bigram",
    "f1": round(f1, 2),
    "accuracy": round(accuracy, 2),
}

[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=  18.0s
  warn(
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   6.6s
  warn(
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   6.4s
  warn(
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   6.6s
  warn(
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=  11.9s
  warn(
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=  16.7s
  warn(
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=

NameError: name 'result_after_hyperparameter' is not defined

### LSTM

In [None]:
def create_model(learning_rate, num_hidden_layers, num_neurons):
    inputs = Input(name='inputs', shape=(X.shape[1],))
    layer = Embedding(input_dim=max_words, output_dim=50, input_length=X.shape[1])(inputs)
    layer = LSTM(64)(layer)
    for i in range(num_hidden_layers):
        layer = Dense(num_neurons)(layer)
        layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(3, name='out_layer')(layer)
    layer = Activation('softmax')(layer)
    model = Model(inputs=inputs, outputs=layer)
    optimizer = RMSprop(learning_rate=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
# Define the objective function to optimize
def objective(learning_rate, num_hidden_layers, num_neurons):
    model = create_model(learning_rate, int(num_hidden_layers), int(num_neurons))
    model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), verbose=0)
    val_loss, val_acc = model.evaluate(X_test, y_test, verbose=0)
    return val_acc

In [None]:
# Define the search space for hyperparameters
pbounds = {'learning_rate': (0.0001, 0.1),
           'num_hidden_layers': (1, 5),
           'num_neurons': (5, 50)}

In [None]:
model = Word2Vec(sentences=df_baru["tokenization"], vector_size=100, window=2, sg=0)
# Function to get document embeddings
def get_doc_embedding(doc):
    embeddings = [model.wv[word] for word in doc if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(100)  # Return zero vector if no embeddings found

# Create document embeddings
doc_embeddings = np.array([get_doc_embedding(doc) for doc in X])
y = df_baru[['negatif', 'netral', 'positif']].values
max_length = max(len(seq) for seq in doc_embeddings)
padded_sequences = pad_sequences(doc_embeddings, maxlen=max_length, padding='post', dtype='float32')
# Assuming y_train is your labels
X = np.array(padded_sequences)
y = np.array(y)
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Perform Bayesian optimization
optimizer = BayesianOptimization(f=objective, pbounds=pbounds, verbose=2)
optimizer.maximize(init_points=5, n_iter=20)

In [None]:
# Print the optimized hyperparameters and validation accuracy
print('Optimized hyperparameters:')
print(optimizer.max['params'])
print('Validation accuracy: {:.2f}%'.format(optimizer.max['target'] * 100))

In [None]:
# Predict labels for test data
y_pred = model_rnn.predict(X_test)
# Convert predicted probabilities to class labels
y_pred_classes = np.argmax(y_pred, axis=1)
# Convert one-hot encoded labels to single labels
y_true = np.argmax(y_test, axis=1)
# Calculate F1 score
f1 = f1_score(y_true, y_pred_classes, average='weighted')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2b3c5800-c216-4f08-93af-5173ca1bb328' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>