# Preprocessing dengan NLP

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [None]:
# Load the comma-separated tweet file into a DataFrame.
dataset = pd.read_csv('tweets.csv', sep=',')

In [None]:
# Bare expression: shows the loaded DataFrame as the notebook cell's output.
dataset

## Cleaning the texts

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Loop-invariant helpers are created once instead of once per tweet (or, for
# the stop-word set, once per word), which keeps the loop linear in the input.
# NOTE(review): the Porter stemmer targets English while the stop-word list is
# Indonesian -- confirm this combination is intended for the tweets.
ps = PorterStemmer()
all_stopwords = stopwords.words('indonesian')  # kept as a list; shown in a later cell
stopword_set = set(all_stopwords)              # O(1) membership tests
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Number of tweets to read: at most the first 1000 rows, clamped so that a
# shorter dataset no longer raises KeyError.
n_tweets = min(1000, len(dataset))
for i in range(n_tweets):
    # Removing numbers and symbols: every non-letter becomes a space.
    review = non_letters.sub(' ', dataset['Tweet'][i])
    print(review)
    # Case folding.
    review = review.lower()
    print(review)
    # Tokenizing on whitespace.
    review = review.split()
    print(review)
    # Filtering (stop-word removal) followed by stemming of the kept words.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    # Re-join the tokens so the vectorizer receives one string per tweet.
    corpus.append(' '.join(review))

In [None]:
# Bare expression: shows the list of cleaned tweets as the cell's output.
corpus

In [None]:
# Bare expression: shows the stop-word list used during cleaning.
all_stopwords

## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts, with the vocabulary capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
# Densify the sparse count matrix into a plain ndarray.
X = cv.fit_transform(corpus).toarray()
# Targets come from the first column. Take exactly as many rows as there are
# cleaned documents so X and y cannot fall out of step if the number of
# processed tweets changes.
y = dataset.iloc[:len(corpus), 0].values

In [None]:
# Bare expression: shows the dense bag-of-words matrix as the cell's output.
X

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Machine Learning Model Selection

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers keyed by display name. A dispatch table replaces the
# if/elif chain, so a typo in a name can no longer silently skip a model.
candidate_models = {
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(random_state=0),
    "K-NN": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    "SVM": SVC(kernel='linear', random_state=0),
    "Kernel SVM": SVC(kernel='rbf', random_state=0),
    "Decision Tree Classification": DecisionTreeClassifier(criterion='entropy', random_state=0),
    "Random Forest Classification": RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0),
}
Select_model = list(candidate_models)
scoring = ['f1_macro', 'precision_macro', 'recall_macro']

# Evaluate and plot every candidate, not only the last one.
for model, classifier in candidate_models.items():
    # 5-fold cross-validation on the full feature matrix; the result dict has
    # one 'test_<metric>' array per entry in `scoring`.
    scores = cross_validate(classifier, X, y, cv=5, scoring=scoring)

    # cross_validate fits clones of the estimator, so fit the estimator itself
    # on the training split to leave a usable `classifier` behind.
    classifier.fit(X_train, y_train)

    # Average each metric over the folds.
    precision = scores['test_precision_macro'].mean()
    recall = scores['test_recall_macro'].mean()
    f1 = scores['test_f1_macro'].mean()

    labels = ['Precision', 'Recall', 'F1-score']
    values = [precision, recall, f1]
    plt.barh(labels, values)

    # Annotate each bar with its value, rounded to three decimals.
    for index, value in enumerate(values):
        plt.text(value, index, f"{value:.3f}")

    plt.title(f'Evaluasi {model} Model')
    plt.show()
               
    
        
        
    

## Training the Naive Bayes model on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB
# Rebinds `classifier`; whichever training cell runs last determines the model
# used by the prediction cell further down.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

## Training the Logistic Regression model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression

# Fixed seed for reproducible results; rebinds `classifier` for later cells.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

## Training the K-NN model on the Training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Five nearest neighbours; Minkowski distance with p=2 is Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

## Training the SVM model on the Training set

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed seed.
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

## Training the Kernel SVM model on the Training set

In [None]:
from sklearn.svm import SVC

# Support vector classifier with the RBF kernel and a fixed seed.
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

## Training the Decision Tree Classification model on the Training set

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree using the entropy split criterion and a fixed seed.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

## Training the Random Forest Classification model on the Training set

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 trees using the entropy split criterion and a fixed seed.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [None]:
# Predict with whichever classifier was trained last.
y_pred = classifier.predict(X_test)
# Two-column view: predictions on the left, true labels on the right.
# assumes y_pred and y_test are both 1-D -- TODO confirm
print(np.column_stack((y_pred, y_test)))

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix of true versus predicted test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Evaluation: each metric is called with its default arguments.
# NOTE(review): precision/recall/F1 default to binary averaging, while the
# model-selection cell reports macro averages -- confirm the labels are binary.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Print the four test-set metrics on a single line.
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F-score: {f1}")

In [None]:
# Metric name -> value, in the order the bars are drawn.
metric_by_name = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-score': f1,
}
labels = list(metric_by_name)
values = list(metric_by_name.values())
plt.barh(labels, values)

# Write each value, rounded to three decimals, at the end of its bar.
for index, value in enumerate(values):
    plt.text(value, index, f"{value:.3f}")

# The title is fixed text; it does not follow the classifier actually trained.
plt.title('Evaluasi Random Forest Model')
plt.show()