In [73]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import pickle

In [2]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
np.random.seed(500)

In [75]:
Corpus = pd.read_csv("/content/drive/MyDrive/TA/svm_nb_train/bukhari_nr.csv", encoding='latin-1')

In [76]:
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


In [77]:
def text_preprocessing(text):
    # Mengubah semua teks menjadi lower case
    text = text.lower()

    # Melakukan tokenisasi
    text_words_list = word_tokenize(text)

    # Menghapus stopword dan melakukan lematisasi
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()

    for word, tag in pos_tag(text_words_list):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    return str(Final_words)

In [78]:
Corpus['text_final'] = Corpus['indo'].map(text_preprocessing)

In [79]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'], Corpus['kitabId'],
                                                                    test_size=0.2)

In [80]:
# Label encode variabel target 
Encoder = LabelEncoder()
Encoder.fit(Train_Y)
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

In [81]:
# Vectorize kata menggunakan TF-IDF Vectorizer
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [82]:
# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)

MultinomialNB()

In [83]:
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

In [84]:
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ", accuracy_score(predictions_NB, Test_Y) * 100)

Naive Bayes Accuracy Score ->  67.76034236804564


In [85]:
print("Naive Bayes Precision Score -> ", precision_score(Test_Y, predictions_NB, average='macro'))

Naive Bayes Precision Score ->  0.7128281985875065


In [86]:
print("Naive Bayes Recall Score -> ", recall_score(Test_Y, predictions_NB, average='macro'))

Naive Bayes Recall Score ->  0.6899769754359398


In [87]:
print("Naive Bayes F1 Score -> ", f1_score(Test_Y, predictions_NB, average='macro'))

Naive Bayes F1 Score ->  0.6568364248794872


In [88]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

SVC(gamma='auto', kernel='linear')

In [89]:
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [90]:
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, Test_Y) * 100)

SVM Accuracy Score ->  79.95720399429386


In [91]:
print("SVM Precision Score -> ", precision_score(Test_Y, predictions_SVM, average='macro'))

SVM Precision Score ->  0.7961988408764923


In [92]:
print("SVM Recall Score -> ", recall_score(Test_Y, predictions_SVM, average='macro'))

SVM Recall Score ->  0.797298193613771


In [93]:
print("SVM F1 Score -> ", f1_score(Test_Y, predictions_SVM, average='macro'))

SVM F1 Score ->  0.7897564370601488


In [47]:
# Predict Data

In [95]:
hadits_data_loc = [
    '/content/drive/MyDrive/TA/json_data/abudaud.json',
    '/content/drive/MyDrive/TA/json_data/darimi.json',
    '/content/drive/MyDrive/TA/json_data/ibnumajah.json',
    '/content/drive/MyDrive/TA/json_data/malik.json',
    '/content/drive/MyDrive/TA/json_data/muslim.json',
    '/content/drive/MyDrive/TA/json_data/nasai.json',
    '/content/drive/MyDrive/TA/json_data/tirmidzi.json'
]
hadits_ns_loc = [
    '/content/drive/MyDrive/TA/noSanad/abudaud.csv',
    '/content/drive/MyDrive/TA/noSanad/darimi.csv',
    '/content/drive/MyDrive/TA/noSanad/ibnumajah.csv',
    '/content/drive/MyDrive/TA/noSanad/malik.csv',
    '/content/drive/MyDrive/TA/noSanad/muslim.csv',
    '/content/drive/MyDrive/TA/noSanad/nasai.csv',
    '/content/drive/MyDrive/TA/noSanad/tirmidzi.csv'
]
hadits_nr_loc = [
    '/content/drive/MyDrive/TA/noSanad_name/abudaud.csv',
    '/content/drive/MyDrive/TA/noSanad_name/darimi.csv',
    '/content/drive/MyDrive/TA/noSanad_name/ibnumajah.csv',
    '/content/drive/MyDrive/TA/noSanad_name/malik.csv',
    '/content/drive/MyDrive/TA/noSanad_name/muslim.csv',
    '/content/drive/MyDrive/TA/noSanad_name/nasai.csv',
    '/content/drive/MyDrive/TA/noSanad_name/tirmidzi.csv'
]
hadits_ns_save_loc = [
    '/content/drive/MyDrive/TA/svm_nb_result/ns/abudaud.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/ns/darimi.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/ns/ibnumajah.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/ns/malik.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/ns/muslim.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/ns/nasai.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/ns/tirmidzi.csv'
]
hadits_nr_save_loc = [
    '/content/drive/MyDrive/TA/svm_nb_result/nr/abudaud.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/nr/darimi.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/nr/ibnumajah.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/nr/malik.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/nr/muslim.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/nr/nasai.csv',
    '/content/drive/MyDrive/TA/svm_nb_result/nr/tirmidzi.csv'
]

In [98]:
def reading_data_result(data_file):
    data_file = pd.read_json(data_file)
    data_file = data_file.sort_values(by=['haditsId'])
    data_file = data_file.reset_index()
    data_file = data_file[["haditsId" , "kitabId","indo", "arab" ]]
    return data_file

for i in tqdm(range(len(hadits_nr_loc))):
  Corpus = pd.read_csv(hadits_nr_loc[i], encoding='latin-1')
  Corpus['text_final'] = Corpus['indo'].map(text_preprocessing)
  Train_X = Corpus['text_final']
  Train_Y = Corpus['kitabId']

  Encoder = LabelEncoder()
  Encoder.fit(Train_Y)
  Train_Y = Encoder.transform(Train_Y)
  Tfidf_vect = TfidfVectorizer(max_features=5000)
  Tfidf_vect.fit(Corpus['text_final'])
  Train_X_Tfidf = Tfidf_vect.transform(Train_X)

  # Naive Bayes
  predictions_NB = Naive.predict(Train_X_Tfidf)
  final = reading_data_result(hadits_data_loc[i])
  final['label'] = predictions_NB
  file_output = hadits_nr_save_loc[i]
  final.to_csv(file_output, index=False)

  # SVM
  predictions_SVM = SVM.predict(Train_X_Tfidf)
  final = reading_data_result(hadits_data_loc[i])
  final['label'] = predictions_SVM
  file_output = hadits_nr_save_loc[i]
  final.to_csv(file_output, index=False)


100%|██████████| 7/7 [07:33<00:00, 64.77s/it]


In [44]:
df = pd.DataFrame({'svm':predictions_SVM,'nb':predictions_NB}, columns = ['svm', 'nb'])

In [46]:
df.groupby('svm').describe()

Unnamed: 0_level_0,nb,nb,nb,nb,nb,nb,nb,nb
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
svm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,22.0,0.000000,0.000000,0.0,0.00,0.0,0.0,0.0
1,19.0,15.894737,19.040585,1.0,1.00,1.0,39.5,44.0
2,19.0,19.000000,20.537229,2.0,2.00,2.0,42.0,57.0
3,15.0,17.200000,20.227986,3.0,3.00,4.0,34.5,51.0
4,23.0,6.043478,8.059316,4.0,4.00,4.0,4.0,42.0
...,...,...,...,...,...,...,...,...
72,28.0,52.071429,20.241008,6.0,43.00,51.0,72.0,76.0
73,24.0,58.333333,25.964301,13.0,59.25,73.0,73.0,73.0
74,22.0,64.181818,19.701671,14.0,74.00,74.0,74.0,74.0
75,12.0,45.333333,23.046725,10.0,25.00,43.0,69.0,75.0
