# Preprocessing dengan NLP

## Importing the libraries

In [91]:
# Importing the libraries (bobot 2)
import numpy as np
import pandas as pd

## Importing the dataset

In [92]:
# Importing the dataset (10 poin)
dataset = pd.read_csv('comment.csv', delimiter = ',')

In [93]:
dataset

Unnamed: 0,Sentimen,Comment
0,0,"Makin jelek aja anaknya, padahal ibu ayahnya c..."
1,0,Jijik astagah sok bat cakep njs
2,1,Kaya tulus ya mukanya klo diliat2
3,0,"Jadi diri sendiri, jangan jadi ular apalagi du..."
4,0,Aduuh.. terlalu halu.. hhahaha lebay editan ny...
...,...,...
1045,1,Bangga sama suami yg selalu ingat istri disela...
1046,1,Apaoun pekerjaannya yg penting halal u tuk men...
1047,1,Gojek itu mayoritas pegangguran yang lama gak ...
1048,1,<USERNAME> aslinya cantik dan ayu loh mbak kr...


## Cleaning the texts

In [94]:
custom_stopwords = []
with open("custom_stopwords.txt", "r") as file:
    for line in file:
        custom_stopwords.append(line.strip())

In [95]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
        
corpus = []
# Specifying number of dataset to be read equals variable nidn (10 poin)
for i in range(0, 1000):
    # Removing numbers and symbols (10 poin)
    review = re.sub('[^a-zA-Z]', ' ', dataset['Comment'][i])
    # Casefolding (5 poin) 
    review = review.lower()
    # Tokenizing (5 poin)
    review = review.split()
    # Stemming with NLTK (5 poin)
    ps = PorterStemmer()
    # Filtering (stopword removal) (10 poin)
    ind_stopwords = stopwords.words('indonesian')
    eng_stopwords = stopwords.words('english')
    all_stopwords = ind_stopwords + eng_stopwords + custom_stopwords
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [96]:
corpus

['jelek ayahnya cakep',
 'jijik cakep nj',
 'tulu diliat',
 'ular',
 'aduuh editan nyaa lemak kemana lemak',
 'nikmatnya menghirup oksigen beba',
 'amin',
 'good vibe',
 'kakinya gedong sayyy jantan upp berbaur warwerwor mahluk astral kakinya gedong',
 'kukira kehadiranmu kebanggaan',
 'cocok biar hancur',
 'poto post',
 'vidio bareng sutan kayanya seneng',
 'ganteng kagak kagak kelebihannya cobak',
 'sampah',
 'yakinin niatnya bg',
 'sehat rejeki tr lancar',
 'warna taek babi',
 'plu hamil gini modelannya',
 'hitam putih prose',
 'diedit ketiak hitam',
 'cowo liatin lidah jam jijik minu gini',
 'hidupnya',
 'anj anj berwujud anj perwujudan',
 'benci bro tingkah melenceng',
 'jatohnya rikkk',
 'bulli ngeri ngerii ajgg',
 'support hate bgst',
 'rambutnya modd bagussss item berkilauu',
 'anj ibunya nangi',
 'muka kampung gaya soan kota nyet',
 'mukalu aktor bokep bgst ngobat luu',
 'diedit biar langs mah bomber badannya',
 'cakep',
 'cocoknya pengusaha kulin fix',
 'randi pangalila gk pa

In [97]:
all_stopwords

['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

## Creating the Bag of Words model

In [98]:
# Creating the Bag of Words model (20 poin)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:1000, 0].values

## Splitting the dataset into the Training set and Test set

In [99]:
# Splitting the dataset into the Training set and Test set (5 poin)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Training the Naive Bayes model on the Training set

In [18]:
# Training the Naive Bayes model on the Training set Gaussian Naive Bayes (5 poin)
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

## Training the Logistic Regression model on the Training set

In [14]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

## Training the K-NN model on the Training set

In [75]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

## Training the Kernel SVM model on the Training set

In [100]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

## Training the Decision Tree Classification model on the Training set

In [51]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

## Training the Random Forest Classification model on the Training set

In [63]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [101]:
# Predicting the Test set results (3 poin)
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 0]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [1 0]

## Making the Confusion Matrix

In [1]:
# Making the Confusion Matrix and print the accuracy (5 poin)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

NameError: name 'y_test' is not defined

In [29]:
Naive Bayes model : 0.805
Logistic Regression model : 0.855
K-NN model : 0.675
Kernel SVM model : 0.825 
Decision Tree Classification model : 0.8
Random Forest Classification model : 0.83

SyntaxError: invalid syntax (2654490863.py, line 1)

In [106]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = SVC(random_state = 0)

eclf = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'Kernel SVM', 'Ensemble']):
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.95 (+/- 0.04) [Logistic Regression]
Accuracy: 0.94 (+/- 0.04) [Random Forest]
Accuracy: 0.95 (+/- 0.04) [Kernel SVM]
Accuracy: 0.95 (+/- 0.04) [Ensemble]
