# Data Labeling Classification

## Import Library

In [1]:
from tqdm import tqdm
import pandas as pd
import transformers
from transformers import pipeline
import matplotlib.pyplot as plt
import ast

  from .autonotebook import tqdm as notebook_tqdm


## Data

### Load CSV

In [2]:
youtube_dataset = pd.read_csv("../data/youtube-comment-cleaned.csv")
youtube_dataset.head()

Unnamed: 0,comment,cleaned_comment,tokenized_comment,stopwords_comment,stemmed_comment,cleaned_stemmed
0,1 POIN PERDANA!!!! Berapa Nilai Untuk Pertandi...,poin perdana berapa nilai untuk pertandingan t...,"['poin', 'perdana', 'berapa', 'nilai', 'untuk'...","['poin', 'perdana', 'nilai', 'pertandingan', '...","['poin', 'perdana', 'nilai', 'tanding', 'timna...",poin perdana nilai tanding timnas senior hadap...
1,GarudaQ sekarang sekelas argentina..vietnam bs...,garudaq sekarang sekelas argentina vietnam bis...,"['garudaq', 'sekarang', 'sekelas', 'argentina'...","['garudaq', 'sekelas', 'argentina', 'vietnam',...","['garudaq', 'kelas', 'argentina', 'vietnam', '...",garudaq kelas argentina vietnam kalahkn timnas...
2,Padahal mau lihat mancini di gbk tapi eh di pe...,padahal mau lihat mancini di gbk tapi eh di pe...,"['padahal', 'mau', 'lihat', 'mancini', 'di', '...","['lihat', 'mancini', 'gbk', 'eh', 'pecat']","['lihat', 'mancini', 'gbk', 'eh', 'pecat']",lihat mancini gbk eh pecat
3,Pffsoz,pffsoz,['pffsoz'],['pffsoz'],['pffsoz'],pffsoz
4,shin busukk indo laos 33 busuuuukkk,shin busukk indo laos busuukk,"['shin', 'busukk', 'indo', 'laos', 'busuukk']","['shin', 'busukk', 'indo', 'laos', 'busuukk']","['shin', 'busukk', 'indo', 'laos', 'busuukk']",shin busukk indo laos busuukk


### Cleaning Confirmation

In [3]:
youtube_dataset = youtube_dataset.dropna()
youtube_dataset.isnull().sum()

comment              0
cleaned_comment      0
tokenized_comment    0
stopwords_comment    0
stemmed_comment      0
cleaned_stemmed      0
dtype: int64

## Classification Using Bigger Model

### Define Variable and Function

In [4]:
distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    return_all_scores=True,
    truncation=True
)

def sentiment_analysis(text):
    result = distilled_student_sentiment_classifier(text)
    highest_score_label = max(result[0], key=lambda x: x['score'])['label']
    return highest_score_label

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


### Apply Function

In [5]:
youtube_dataset["sentiment_prediction"] = [sentiment_analysis(comment) for comment in tqdm(youtube_dataset['cleaned_stemmed'])]

youtube_dataset.head()

100%|██████████| 21664/21664 [14:19<00:00, 25.19it/s]


Unnamed: 0,comment,cleaned_comment,tokenized_comment,stopwords_comment,stemmed_comment,cleaned_stemmed,sentiment_prediction
0,1 POIN PERDANA!!!! Berapa Nilai Untuk Pertandi...,poin perdana berapa nilai untuk pertandingan t...,"['poin', 'perdana', 'berapa', 'nilai', 'untuk'...","['poin', 'perdana', 'nilai', 'pertandingan', '...","['poin', 'perdana', 'nilai', 'tanding', 'timna...",poin perdana nilai tanding timnas senior hadap...,positive
1,GarudaQ sekarang sekelas argentina..vietnam bs...,garudaq sekarang sekelas argentina vietnam bis...,"['garudaq', 'sekarang', 'sekelas', 'argentina'...","['garudaq', 'sekelas', 'argentina', 'vietnam',...","['garudaq', 'kelas', 'argentina', 'vietnam', '...",garudaq kelas argentina vietnam kalahkn timnas...,negative
2,Padahal mau lihat mancini di gbk tapi eh di pe...,padahal mau lihat mancini di gbk tapi eh di pe...,"['padahal', 'mau', 'lihat', 'mancini', 'di', '...","['lihat', 'mancini', 'gbk', 'eh', 'pecat']","['lihat', 'mancini', 'gbk', 'eh', 'pecat']",lihat mancini gbk eh pecat,negative
3,Pffsoz,pffsoz,['pffsoz'],['pffsoz'],['pffsoz'],pffsoz,positive
4,shin busukk indo laos 33 busuuuukkk,shin busukk indo laos busuukk,"['shin', 'busukk', 'indo', 'laos', 'busuukk']","['shin', 'busukk', 'indo', 'laos', 'busuukk']","['shin', 'busukk', 'indo', 'laos', 'busuukk']",shin busukk indo laos busuukk,negative


## Save Classification CSV

In [6]:
youtube_dataset.to_csv("../data/youtube-comment-cleaned-sentiment.csv", index=False)
youtube_dataset.head()

Unnamed: 0,comment,cleaned_comment,tokenized_comment,stopwords_comment,stemmed_comment,cleaned_stemmed,sentiment_prediction
0,1 POIN PERDANA!!!! Berapa Nilai Untuk Pertandi...,poin perdana berapa nilai untuk pertandingan t...,"['poin', 'perdana', 'berapa', 'nilai', 'untuk'...","['poin', 'perdana', 'nilai', 'pertandingan', '...","['poin', 'perdana', 'nilai', 'tanding', 'timna...",poin perdana nilai tanding timnas senior hadap...,positive
1,GarudaQ sekarang sekelas argentina..vietnam bs...,garudaq sekarang sekelas argentina vietnam bis...,"['garudaq', 'sekarang', 'sekelas', 'argentina'...","['garudaq', 'sekelas', 'argentina', 'vietnam',...","['garudaq', 'kelas', 'argentina', 'vietnam', '...",garudaq kelas argentina vietnam kalahkn timnas...,negative
2,Padahal mau lihat mancini di gbk tapi eh di pe...,padahal mau lihat mancini di gbk tapi eh di pe...,"['padahal', 'mau', 'lihat', 'mancini', 'di', '...","['lihat', 'mancini', 'gbk', 'eh', 'pecat']","['lihat', 'mancini', 'gbk', 'eh', 'pecat']",lihat mancini gbk eh pecat,negative
3,Pffsoz,pffsoz,['pffsoz'],['pffsoz'],['pffsoz'],pffsoz,positive
4,shin busukk indo laos 33 busuuuukkk,shin busukk indo laos busuukk,"['shin', 'busukk', 'indo', 'laos', 'busuukk']","['shin', 'busukk', 'indo', 'laos', 'busuukk']","['shin', 'busukk', 'indo', 'laos', 'busuukk']",shin busukk indo laos busuukk,negative


### Balance Sentiment Data

In [7]:
check_sentiment = youtube_dataset["sentiment_prediction"].value_counts()
min_count = check_sentiment.min()
dataset_balanced = youtube_dataset.groupby('sentiment_prediction').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
print(dataset_balanced["sentiment_prediction"].value_counts())

sentiment_prediction
negative    145
neutral     145
positive    145
Name: count, dtype: int64


  dataset_balanced = youtube_dataset.groupby('sentiment_prediction').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


In [8]:
dataset_balanced.to_csv("../data/youtube-comment-cleaned-sentiment-balanced.csv", index=False)

### Reformating Column

In [9]:
youtube_dataset_clean = youtube_dataset[["cleaned_stemmed", "sentiment_prediction"]]
youtube_dataset_clean.head()

Unnamed: 0,cleaned_stemmed,sentiment_prediction
0,poin perdana nilai tanding timnas senior hadap...,positive
1,garudaq kelas argentina vietnam kalahkn timnas...,negative
2,lihat mancini gbk eh pecat,negative
3,pffsoz,positive
4,shin busukk indo laos busuukk,negative


In [10]:
youtube_dataset_clean.to_csv("../data/youtube-comment-cleaned-sentiment-reformat.csv", index=False)

### Balance Data After Reformat

In [11]:
check_sentiment = youtube_dataset_clean["sentiment_prediction"].value_counts()
min_count = check_sentiment.min()
dataset_balanced = youtube_dataset_clean.groupby('sentiment_prediction').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
print(dataset_balanced["sentiment_prediction"].value_counts())

sentiment_prediction
negative    145
neutral     145
positive    145
Name: count, dtype: int64


  dataset_balanced = youtube_dataset_clean.groupby('sentiment_prediction').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


In [12]:
dataset_balanced.to_csv("../data/youtube-comment-cleaned-sentiment-reformat-balanced.csv", index=False)