In [None]:
import pandas as pd
import joblib
import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from gensim.models import Word2Vec

### Read Cleaned Dataset

In [None]:
# Load the cleaned tweet dataset from disk and preview its first rows.
# Every vectorizer below reads the text from the 'tweet' column of this frame.
df = pd.read_csv('./Preprocessed/cleaned_data.csv')
df.head()

Unnamed: 0,sentimen,tweet
0,negatif,kata prabowo indonesia tidak harga bangsa asin...
1,netral,batu langka tasbih jokowi hadiah dari habib lu...
2,netral,di era jokowi ekonomi indonesia makin baik ind...
3,positif,bagi sumatera selatan asi games dampak pd ekon...
4,negatif,negara kita ngutang buat bngun infrastruktur y...


In [None]:
# Base output directory: each vectorizer function receives it as `path` and
# writes its artifacts into its own sub-directory underneath.
PATH = './Dataset/'

## Frequency-Based Vectorization

### Count Vector

In [None]:
def count_vectorize(dataset, path):
    """Build bag-of-words count features for the tweets and persist them.

    Fits a ``CountVectorizer`` (no stop-word removal) on ``dataset['tweet']``,
    replaces the raw ``tweet`` text column with one count column per
    vocabulary term, and writes the fitted vectorizer and the resulting table
    to ``<path>/count_vector/``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``tweet`` column of strings. It is not modified.
    path : str
        Base output directory; the ``count_vector`` sub-directory is created
        if it does not exist.

    Returns
    -------
    pandas.DataFrame
        The non-text columns of ``dataset`` followed by the count features.
    """
    tweets = dataset['tweet']
    # Initialize CountVectorizer without applying stop words
    vectorizer = CountVectorizer(stop_words=None)
    # Perform text vectorization
    X = vectorizer.fit_transform(tweets)
    # Reuse the dataset's index so that the column-wise concat below pairs each
    # feature row with its own tweet even when the index is not 0..n-1
    # (e.g. after filtering rows).
    vectorized_df = pd.DataFrame(
        X.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=dataset.index,
    )
    # Drop the raw text BEFORE concatenating: dropping 'tweet' afterwards would
    # also remove a vocabulary feature that happens to be called "tweet",
    # leaving the saved table out of sync with the saved vectorizer.
    # NOTE: a vocabulary term equal to another dataset column name
    # (e.g. "sentimen") still produces a duplicated column label.
    new_dataset = pd.concat([dataset.drop(columns='tweet'), vectorized_df], axis=1)

    save_path = os.path.join(path, 'count_vector')
    os.makedirs(save_path, exist_ok=True)

    # Dump the vectorizer to a file using joblib
    vectorizer_filename = os.path.join(save_path, 'count_vectorizer.joblib')
    joblib.dump(vectorizer, vectorizer_filename)

    # Save the new dataset to a CSV file
    dataset_filename = os.path.join(save_path, 'count_vectorizer.csv')
    new_dataset.to_csv(dataset_filename, index=False)

    return new_dataset

In [None]:
# Build and save the count-vector features; the shape is (rows, columns).
count_vector_df = count_vectorize(df, PATH)
count_vector_df.shape

(1815, 5185)

In [None]:
# Preview the first rows of the count-vector feature table.
count_vector_df.head()

Unnamed: 0,sentimen,aa,aamiin,aamiinn,aamin,aammiin,abadi,abah,abai,abang,...,yuuk,zaenal,zakat,zaman,zer,zero,ziarah,zippo,zon,zonk
0,negatif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,netral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,netral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,positif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,negatif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF

In [None]:
def tf_idf(dataset, path):
    """Build TF-IDF features for the tweets and persist them.

    Fits a ``TfidfVectorizer`` (no stop-word removal) on ``dataset['tweet']``,
    replaces the raw ``tweet`` text column with one TF-IDF weight column per
    vocabulary term, and writes the fitted vectorizer and the resulting table
    to ``<path>/tfidf/``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``tweet`` column of strings. It is not modified.
    path : str
        Base output directory; the ``tfidf`` sub-directory is created if it
        does not exist.

    Returns
    -------
    pandas.DataFrame
        The non-text columns of ``dataset`` followed by the TF-IDF features.
    """
    tweets = dataset['tweet']
    # Initialize TfidfVectorizer without applying stop words
    vectorizer = TfidfVectorizer(stop_words=None)
    # Perform text vectorization
    X = vectorizer.fit_transform(tweets)
    # Reuse the dataset's index so that the column-wise concat below pairs each
    # feature row with its own tweet even when the index is not 0..n-1.
    vectorized_df = pd.DataFrame(
        X.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=dataset.index,
    )
    # Drop the raw text BEFORE concatenating: dropping 'tweet' afterwards would
    # also remove a vocabulary feature that happens to be called "tweet",
    # leaving the saved table out of sync with the saved vectorizer.
    # NOTE: a vocabulary term equal to another dataset column name
    # (e.g. "sentimen") still produces a duplicated column label.
    new_dataset = pd.concat([dataset.drop(columns='tweet'), vectorized_df], axis=1)

    save_path = os.path.join(path, 'tfidf')
    os.makedirs(save_path, exist_ok=True)

    # Dump the vectorizer to a file using joblib
    vectorizer_filename = os.path.join(save_path, 'tfidf_vectorizer.joblib')
    joblib.dump(vectorizer, vectorizer_filename)

    # Save the new dataset to a CSV file
    dataset_filename = os.path.join(save_path, 'tfidf_vectorized_dataset.csv')
    new_dataset.to_csv(dataset_filename, index=False)

    return new_dataset

In [None]:
# Build and save the TF-IDF features; the shape is (rows, columns).
tfidf_df = tf_idf(df, PATH)
tfidf_df.shape

(1815, 5185)

In [None]:
# Preview the first rows of the TF-IDF feature table.
tfidf_df.head()

Unnamed: 0,sentimen,aa,aamiin,aamiinn,aamin,aammiin,abadi,abah,abai,abang,...,yuuk,zaenal,zakat,zaman,zer,zero,ziarah,zippo,zon,zonk
0,negatif,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,netral,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,netral,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,positif,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,negatif,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Co-Occurrence Matrix

In [None]:
# Work on a separate copy so the original `df` (raw tweet text) stays
# untouched for the vectorizers that run later in the notebook.
coo_matrix_df = df.copy()

In [None]:
def split_text(text):
    """Tokenize ``text`` on runs of whitespace.

    Used as the ``tokenizer`` callable of the co-occurrence ``CountVectorizer``.
    Returns a list of tokens; an empty or whitespace-only string gives ``[]``.
    """
    tokens = text.split()
    return tokens

def co_occurrence_matrix(dataset, path):
    """Embed each tweet as the sum of the co-occurrence vectors of its tokens.

    A ``CountVectorizer`` using ``split_text`` as tokenizer produces the
    tweet/term count matrix ``X``. The term/term co-occurrence matrix is
    ``C = X.T @ X`` (terms co-occur when they appear in the same tweet), and
    the vector of a tweet is the sum of the rows of ``C`` belonging to its
    tokens, i.e. row ``i`` of ``X @ C``.

    Files written to ``<path>/coo_matrix/``:

    * ``coo_dataset.csv`` - the dataset with ``tweet`` replaced by the vectors,
    * ``coo_matrix.csv`` - the co-occurrence matrix (one column per term),
    * ``coo_matrix_vectorizer.joblib`` - the fitted vectorizer. It is pickled
      with a reference to ``split_text``, so that function presumably has to
      be defined again before the file is loaded.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``tweet`` column of strings. It is not modified.
    path : str
        Base output directory; the ``coo_matrix`` sub-directory is created if
        it does not exist.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` whose ``tweet`` column holds one float vector of
        vocabulary length per row.
    """
    tweets = dataset['tweet']
    # Initialize CountVectorizer without applying stop words
    vectorizer = CountVectorizer(tokenizer=split_text, stop_words=None)
    # Perform text vectorization
    X = vectorizer.fit_transform(tweets)
    # Get the vocabulary (list of words)
    vocab = vectorizer.get_feature_names_out()
    # Calculate the co-occurrence matrix (sparse, vocabulary x vocabulary)
    co_occurrence = X.T.dot(X)
    # DataFrame view of the matrix, one column per vocabulary term
    co_occurrence_df = pd.DataFrame(co_occurrence.toarray(), columns=vocab)

    # Summing the co-occurrence vector of every token of a tweet (counting
    # repeats) is exactly the matrix product X @ C because C is symmetric.
    # Doing it as one sparse product avoids a linear vocabulary scan and a
    # DataFrame column lookup per word, and uses the very same tokens the
    # vectorizer produced (including its lower-casing).
    tweet_matrix = X.dot(co_occurrence).toarray().astype(float)

    # Store the vectors on a copy so the caller's frame keeps its raw text
    result = dataset.copy()
    result['tweet'] = list(tweet_matrix)

    save_path = os.path.join(path, 'coo_matrix')
    os.makedirs(save_path, exist_ok=True)

    # Save the dataset including the vectorized embeddings.
    # The vectors are written as their string form; numpy abbreviates arrays
    # longer than its print threshold (1000 by default) with "...", which
    # would silently discard most of every vector. Raise the threshold above
    # the vector length while writing so the full arrays reach the file.
    dataset_filename = os.path.join(save_path, 'coo_dataset.csv')
    with np.printoptions(threshold=len(vocab) + 1):
        result.to_csv(dataset_filename, index=False)

    # Save the co-occurrence matrix DataFrame to a CSV file
    co_occurrence_filename = os.path.join(save_path, 'coo_matrix.csv')
    co_occurrence_df.to_csv(co_occurrence_filename, index=False)  # Set index to False to exclude the index column

    # Save the CountVectorizer using Joblib
    vectorizer_filename = os.path.join(save_path, 'coo_matrix_vectorizer.joblib')
    joblib.dump(vectorizer, vectorizer_filename)

    return result

In [None]:
# Replace the tweet text with co-occurrence-based vectors and save the
# artifacts; the shape is (rows, columns).
coo_matrix_df = co_occurrence_matrix(coo_matrix_df, PATH)
coo_matrix_df.shape

(1815, 2)

In [None]:
# Preview the first rows; 'tweet' now holds one vector per row.
coo_matrix_df.head()

Unnamed: 0,sentimen,tweet
0,negatif,"[15.0, 20.0, 1.0, 2.0, 3.0, 8.0, 0.0, 2.0, 3.0..."
1,netral,"[3.0, 1.0, 0.0, 0.0, 0.0, 3.0, 2.0, 1.0, 1.0, ..."
2,netral,"[18.0, 16.0, 3.0, 4.0, 2.0, 6.0, 11.0, 5.0, 1...."
3,positif,"[5.0, 6.0, 2.0, 1.0, 1.0, 3.0, 5.0, 4.0, 0.0, ..."
4,negatif,"[11.0, 19.0, 1.0, 2.0, 1.0, 12.0, 2.0, 7.0, 4...."


### N-Gram

In [None]:
def ngram_vectorize(dataset, path, ngram_type='unigram'):
    """Build n-gram count features for the tweets and persist them.

    Fits a ``CountVectorizer`` (no stop-word removal) on ``dataset['tweet']``
    with an n-gram range chosen by ``ngram_type``, replaces the raw ``tweet``
    text column with one count column per n-gram, and writes the fitted
    vectorizer and the resulting table to ``<path>/n_gram/``.

    The ranges are cumulative: ``'unigram'`` -> ``(1, 1)``, ``'bigram'`` ->
    ``(1, 2)`` (unigrams and bigrams), ``'trigram'`` -> ``(1, 3)`` (unigrams,
    bigrams and trigrams).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``tweet`` column of strings. It is not modified.
    path : str
        Base output directory; the ``n_gram`` sub-directory is created if it
        does not exist.
    ngram_type : str, default 'unigram'
        One of ``'unigram'``, ``'bigram'`` or ``'trigram'``. It also prefixes
        the names of the files that are written.

    Returns
    -------
    pandas.DataFrame
        The non-text columns of ``dataset`` followed by the n-gram features.

    Raises
    ------
    ValueError
        If ``ngram_type`` is not one of the three supported values.
    """
    tweets = dataset['tweet']
    # Determine the n-gram range based on the specified n-gram type
    if ngram_type == 'unigram':
        ngram_range = (1, 1)
    elif ngram_type == 'bigram':
        ngram_range = (1, 2)
    elif ngram_type == 'trigram':
        ngram_range = (1, 3)
    else:
        raise ValueError("Invalid ngram_type. Must be 'unigram', 'bigram', or 'trigram'.")

    # Initialize CountVectorizer without applying stop words
    vectorizer = CountVectorizer(stop_words=None, ngram_range=ngram_range)
    # Perform text vectorization
    X = vectorizer.fit_transform(tweets)
    # Reuse the dataset's index so that the column-wise concat below pairs each
    # feature row with its own tweet even when the index is not 0..n-1.
    vectorized_df = pd.DataFrame(
        X.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=dataset.index,
    )
    # Drop the raw text BEFORE concatenating: dropping 'tweet' afterwards would
    # also remove a vocabulary feature that happens to be called "tweet",
    # leaving the saved table out of sync with the saved vectorizer.
    # NOTE: a vocabulary term equal to another dataset column name
    # (e.g. "sentimen") still produces a duplicated column label.
    new_dataset = pd.concat([dataset.drop(columns='tweet'), vectorized_df], axis=1)

    save_path = os.path.join(path, 'n_gram')
    os.makedirs(save_path, exist_ok=True)

    # Dump the vectorizer to a file using joblib
    vectorizer_filename = os.path.join(save_path, f'{ngram_type}_vectorizer.joblib')
    joblib.dump(vectorizer, vectorizer_filename)

    # Save the new dataset to a CSV file
    dataset_filename = os.path.join(save_path, f'{ngram_type}_vectorized_dataset.csv')
    new_dataset.to_csv(dataset_filename, index=False)

    return new_dataset

In [None]:
# Build and save one feature table per n-gram setting. Each call fits its own
# vectorizer on the same `df` and writes files prefixed with the n-gram type.
unigram_df = ngram_vectorize(df, PATH, ngram_type='unigram')
bigram_df = ngram_vectorize(df, PATH, ngram_type='bigram')
trigram_df = ngram_vectorize(df, PATH, ngram_type='trigram')

In [None]:
# Preview the unigram feature table.
unigram_df.head()

Unnamed: 0,sentimen,aa,aamiin,aamiinn,aamin,aammiin,abadi,abah,abai,abang,...,yuuk,zaenal,zakat,zaman,zer,zero,ziarah,zippo,zon,zonk
0,negatif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,netral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,netral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,positif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,negatif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Preview the 'bigram' feature table (built with ngram_range=(1, 2), so it
# contains single words as well as word pairs).
bigram_df.head()

Unnamed: 0,sentimen,aa,aa gym,aa kita,aamiin,aamiin prabowo,aamiin the,aamiin ya,aamiinn,aamin,...,ziarah kubur,zippo,zippo pilih,zon,zon bantah,zon itu,zon kaya,zon prabowo,zonk,zonk dpr
0,negatif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,netral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,netral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,positif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,negatif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Preview the 'trigram' feature table (built with ngram_range=(1, 3), so it
# contains single words, word pairs and word triples).
trigram_df.head()

Unnamed: 0,sentimen,aa,aa gym,aa gym pilih,aa kita,aa kita mantap,aamiin,aamiin prabowo,aamiin prabowo sandiuno,aamiin the,...,zon bantah nyata,zon itu,zon itu sulit,zon kaya,zon kaya emak,zon prabowo,zon prabowo hanya,zonk,zonk dpr,zonk dpr kita
0,negatif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,netral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,netral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,positif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,negatif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Prediction-Based Vectorization

### CBOW (Continuous Bag of Words)

In [None]:
# Work on a separate copy so the original `df` (raw tweet text) stays
# untouched for the other vectorizers.
cbow_df = df.copy()

In [None]:
def cbow(df, path, vector_size=100, window=5):
    """Embed each tweet as the mean of its CBOW Word2Vec word vectors.

    Tweets are tokenized on whitespace, a Word2Vec model is trained on them
    in CBOW mode (``sg=0``), and every tweet becomes the average of the
    vectors of its tokens that the trained model knows. A tweet with no known
    token gets an all-zero vector. The model and the resulting table are
    written to ``<path>/cbow/``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. It is not modified.
    path : str
        Base output directory; the ``cbow`` sub-directory is created if it
        does not exist.
    vector_size : int, default 100
        Dimensionality of the word (and therefore tweet) vectors.
    window : int, default 5
        Context window size passed to Word2Vec.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``tweet`` column holds one vector of length
        ``vector_size`` per row.

    Notes
    -----
    No worker count is set here; per the gensim documentation, runs are only
    fully reproducible with a single worker thread, so repeated runs may give
    slightly different vectors.
    """
    # Obtain corpus from DataFrame
    corpus = [tweet.split() for tweet in df['tweet']]  # Tokenize each tweet

    # Train CBOW Word2Vec model
    cbow_model = Word2Vec(sentences=corpus, vector_size=vector_size, window=window, sg=0)
    word_vectors = cbow_model.wv

    # Vectorize tweets using CBOW model
    tweet_vectors = []
    for tokens in corpus:
        total = np.zeros(cbow_model.vector_size)  # Running sum of word vectors
        known = 0
        for word in tokens:
            if word in word_vectors:  # Skip words the model has no vector for
                total += word_vectors[word]
                known += 1
        if known != 0:
            total /= known  # Average the word vectors
        tweet_vectors.append(total)

    # Store the vectors on a copy so the caller's frame keeps its raw text
    result = df.copy()
    result['tweet'] = tweet_vectors

    save_path = os.path.join(path, 'cbow')
    os.makedirs(save_path, exist_ok=True)

    # Save the trained CBOW model
    model_filename = os.path.join(save_path, 'cbow_model.joblib')
    joblib.dump(cbow_model, model_filename)

    # Save the DataFrame to a CSV file
    dataset_filename = os.path.join(save_path, 'cbow.csv')
    result.to_csv(dataset_filename, index=False)

    return result

In [None]:
# Train the CBOW model, replace the tweet text with averaged word vectors,
# save the artifacts and preview the result.
cbow_df = cbow(cbow_df, PATH)
cbow_df.head()

Unnamed: 0,sentimen,tweet
0,negatif,"[-0.17692965045571327, 0.34841369887193047, 0...."
1,netral,"[-0.2364843487739563, 0.4504990776379903, 0.43..."
2,netral,"[-0.1864237571756045, 0.4020713925361633, 0.36..."
3,positif,"[-0.13527522821511542, 0.2696121409535408, 0.2..."
4,negatif,"[-0.17235149359030108, 0.3391154577895518, 0.3..."


### Skip-Gram

In [None]:
# Work on a separate copy so the original `df` (raw tweet text) stays
# untouched for the other vectorizers.
skipgram_df = df.copy()

In [None]:
def skipgram(df, path, vector_size=100, window=5):
    """Embed each tweet as the mean of its Skip-gram Word2Vec word vectors.

    Tweets are tokenized on whitespace, a Word2Vec model is trained on them
    in Skip-gram mode (``sg=1``), and every tweet becomes the average of the
    vectors of its tokens that the trained model knows. A tweet with no known
    token gets an all-zero vector, matching what ``cbow`` does. The model and
    the resulting table are written to ``<path>/skipgram/``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings. It is not modified.
    path : str
        Base output directory; the ``skipgram`` sub-directory is created if
        it does not exist.
    vector_size : int, default 100
        Dimensionality of the word (and therefore tweet) vectors.
    window : int, default 5
        Context window size passed to Word2Vec.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``tweet`` column holds one vector of length
        ``vector_size`` per row.
    """
    # Obtain corpus from DataFrame
    corpus = [tweet.split() for tweet in df['tweet']]  # Tokenize each tweet

    # Train Skip-gram Word2Vec model
    skipgram_model = Word2Vec(sentences=corpus, vector_size=vector_size, window=window, sg=1)
    word_vectors = skipgram_model.wv

    # Vectorize tweets using Skip-gram model
    tweet_vectors = []
    for tokens in corpus:
        known = [word_vectors[word] for word in tokens if word in word_vectors]
        if known:
            tweet_vectors.append(np.mean(known, axis=0))  # Average word vectors
        else:
            # np.mean of an empty list is a scalar NaN (with a RuntimeWarning),
            # which would put a non-vector value in the column. Fall back to a
            # zero vector with the same dtype as the model's word vectors.
            tweet_vectors.append(
                np.zeros(skipgram_model.vector_size, dtype=word_vectors.vectors.dtype)
            )

    # Store the vectors on a copy so the caller's frame keeps its raw text
    result = df.copy()
    result['tweet'] = tweet_vectors

    # Create directory for saving files
    save_path = os.path.join(path, 'skipgram')
    os.makedirs(save_path, exist_ok=True)

    # Save the trained Skip-gram model
    model_filename = os.path.join(save_path, 'skipgram_model.joblib')
    joblib.dump(skipgram_model, model_filename)

    # Save the DataFrame to a CSV file
    dataset_filename = os.path.join(save_path, 'skipgram.csv')
    result.to_csv(dataset_filename, index=False)

    return result


In [None]:
# Train the Skip-gram model, replace the tweet text with averaged word
# vectors, save the artifacts and preview the result.
skipgram_df = skipgram(skipgram_df, PATH)
skipgram_df.head()

Unnamed: 0,sentimen,tweet
0,negatif,"[-0.08032813, 0.15281476, 0.15728903, 0.087458..."
1,netral,"[-0.12832697, 0.13469094, 0.23886347, 0.156421..."
2,netral,"[-0.010187586, 0.19339877, 0.12789437, 0.08436..."
3,positif,"[-0.06410887, 0.15075667, 0.13999256, 0.080862..."
4,negatif,"[-0.09163668, 0.15062967, 0.1592201, 0.1026617..."


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2b3c5800-c216-4f08-93af-5173ca1bb328' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>