# Preprocessing dengan NLP

## Importing the libraries

In [1]:
# Importing the libraries (bobot 2)
import numpy as np
import pandas as pd

## Importing the dataset

In [2]:
# Load the comma-separated tweet file into a DataFrame (10 points)
dataset = pd.read_csv('tweets.csv', sep=',')

In [3]:
# Display the loaded DataFrame as the cell output for a quick visual check.
dataset

Unnamed: 0,Sentiment,Tweet
0,0,betapa bejad kader gerindra yang anggota dprd ...
1,0,kamar nya sempit tidak ada tempat menyimpan ba...
2,0,jangan percaya dengan agama buddha . agama itu...
3,0,yusri seorang ibu yang tidak tahu diri . sudah...
4,0,menemukan bocah-bocah sampai yang komentar di ...
...,...,...
1123,1,harga makanan di sini menurut saya di atas rat...
1124,1,restoran ini bisa jadi alternatif pilihan saat...
1125,1,sering ke sini kalau punya tamu-tamu dari luar...
1126,1,batagor kingsley ini enak ! tidak banyak tepun...


## Cleaning the texts

In [4]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build `corpus`: one cleaned, space-joined string per tweet.
# Everything that is identical for every tweet (regex, stemmer, stopword
# list and its set) is created once here rather than once per tweet/word.
non_letters = re.compile('[^a-zA-Z]')
# Stemming with NLTK (5 points)
# NOTE(review): the Porter algorithm strips English suffixes; on this
# Indonesian text it may mangle words. An Indonesian stemmer would likely be
# more appropriate -- confirm against the assignment requirements.
ps = PorterStemmer()
# Filtering (stopword removal) (10 points)
all_stopwords = stopwords.words('indonesian')
stopword_set = set(all_stopwords)  # set membership is O(1) per word

corpus = []
# Number of tweets to read equals the variable nidn (10 points).
# Positional slicing matches how the labels are sliced later (`iloc[:1000]`).
for tweet in dataset['Tweet'].iloc[:1000]:
    # Removing numbers and symbols (10 points)
    review = non_letters.sub(' ', tweet)
    # Casefolding (5 points)
    review = review.lower()
    # Tokenizing (5 points)
    review = review.split()
    # Drop stopwords, then stem the remaining tokens
    review = [ps.stem(word) for word in review if word not in stopword_set]
    corpus.append(' '.join(review))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Display the list of cleaned tweet strings built in the text-cleaning cell.
corpus

['betapa bejad kader gerindra anggota dprd mencabuli anak smp rakyat cerda partai tengelamkan',
 'kamar nya sempit menyimpan barang menambah barang by the way kipa diletakkan uchiwa seged',
 'percaya agama buddha agama tuhan agama sesat',
 'yusri uang mebel muka dikaca dibantu terima kasih dasar male',
 'menemukan bocah bocah komentar yutub pakai kotor mengajarkan dah sedih',
 'indomaret penjahat anyar dunia perdagangan',
 'tren si tukang fitnah bicara iman dasar akun sapi takut kehilangan kursi cawapr prabowo busukkan nama ahi say',
 'modalin sediakan carikan pemb sindir warga zaman now bergantung pemerintah banget',
 'musnahkan lgbt gay homo lesbian muka bumi ancaman terbesar umat manusia dibumi dibiarkan manusia punah',
 'gue bingung pemerintahan jokowi orang muji merusak negara aneh',
 'ampera benhil pegawai jude asap rokok kotor cabang ampera mengecewakan',
 'profesion institusi kecewa rapi bermain main rapi kotor',
 'jokwi presiden simbol simbol pki tumbuh subur henti berdaya mem

In [6]:
# Display the Indonesian stopword list assigned in the text-cleaning cell.
all_stopwords

['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

## Creating the Bag of Words model

In [7]:
# Creating the Bag of Words model (20 points)
from sklearn.feature_extraction.text import CountVectorizer
# Limit the vocabulary to at most 1500 terms.
cv = CountVectorizer(max_features = 1500)
# Dense document-term count matrix with one row per entry of `corpus`.
X = cv.fit_transform(corpus).toarray()
# Labels are read positionally from column 0 of `dataset`
# (presumably the Sentiment column -- verify against the CSV layout).
# Slicing by len(corpus) instead of a repeated literal keeps X and y the
# same length even if the number of cleaned tweets changes.
y = dataset.iloc[:len(corpus), 0].values

In [9]:
# Display the bag-of-words feature array produced by the CountVectorizer cell.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Splitting the dataset into the Training set and Test set

In [10]:
# Hold out 20% of the samples as the test set; the fixed random_state
# makes the split repeatable (5 points)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Naive Bayes model on the Training set

In [11]:
# Fit a Gaussian Naive Bayes classifier on the training split (5 points)
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
# Left as the cell's final bare expression so the fitted estimator is echoed.
classifier.fit(X=X_train, y=y_train)

## Predicting the Test set results

In [12]:
# Predict labels for the test split (3 points)
y_pred = classifier.predict(X_test)
# Print predictions (left column) next to the true labels (right column).
print(np.column_stack((y_pred, y_test)))

[[1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]

## Making the Confusion Matrix

In [13]:
# Confusion matrix and accuracy on the test split (5 points)
from sklearn.metrics import confusion_matrix, accuracy_score
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Final bare expression so the accuracy is shown as the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[ 64  10]
 [ 21 105]]


0.845