In [63]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [64]:
# Load the pre-processed training set, turn empty strings into NaN and drop
# every row that has a missing value in any column.
df = (
    pd.read_csv('Train Processing.csv')
    .replace('', np.nan)
    .dropna()
)

# Texts of each class (label 1 / label 0), re-indexed from 0.
Positive = df.loc[df['label'].eq(1), 'text'].reset_index(drop=True)
Negative = df.loc[df['label'].eq(0), 'text'].reset_index(drop=True)

In [65]:
# Class balance check: (number of label-1 texts, number of label-0 texts).
tuple(map(len, (Positive, Negative)))

(19981, 20019)

In [66]:
# TF-IDF vectoriser with a pruned vocabulary:
#   min_df=5          -> ignore terms found in fewer than 5 documents
#   max_df=0.7        -> ignore terms found in more than 70% of documents
#   max_features=1500 -> keep only the 1500 most frequent remaining terms
vect = TfidfVectorizer(
    min_df=5,
    max_df=0.7,
    max_features=1500,
)


In [67]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'].values,
    df['label'].values,
    test_size=0.2,
    random_state=0,
)

In [68]:
# Sanity check: sizes of the four arrays produced by the split.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(32000, 8000, 32000, 8000)

In [69]:
# Learn the vocabulary and IDF weights from the training texts only, then
# encode the training texts with them.
tfidf_train = vect.fit_transform(X_train)
# Encode the held-out texts with the already-fitted vectoriser (no re-fitting),
# so nothing about the test set leaks into the features.
tfidf_test = vect.transform(X_test)

Pada random forest, parameter yang diobservasi adalah `n_estimators`, `max_depth`, dan `max_features`.

Di bawah ini digunakan `n_estimators=200`, `max_depth=20`, dan `max_features='auto'` (untuk klasifikasi, `'auto'` setara dengan `'sqrt'`).

In [76]:
# Experiment 1: 200 trees, depth capped at 20, sqrt(n_features) candidate
# features per split.
#
# * max_features='sqrt' is exactly what the legacy 'auto' alias meant for
#   classifiers; 'auto' is rejected by scikit-learn >= 1.3.
# * min_impurity_split is not passed: the parameter was removed in
#   scikit-learn 1.0 (passing it raises TypeError) and omitting it is the same
#   as the old default of None.
# * Every other setting is left at the library default.
# * random_state is fixed so the reported metrics can be reproduced; numbers
#   from an earlier unseeded run may differ slightly.
clf1 = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    max_features='sqrt',
    random_state=0,
)
clf1 = clf1.fit(tfidf_train, y_train)
preds1 = clf1.predict(tfidf_test)

# NOTE: with average='weighted', recall is mathematically equal to accuracy
# (per-class recall weighted by class support), so the first and last lines
# always print the same value.
print('Accuracy: {}%'.format(accuracy_score(y_test, preds1) * 100))
print('Precision: {}%'.format(precision_score(y_test, preds1, average='weighted') * 100))
print('Recall: {}%'.format(recall_score(y_test, preds1, average='weighted') * 100))

Accuracy: 82.4375%
Precision: 82.56765488016423%
Recall: 82.4375%


Di bawah ini digunakan `n_estimators=100`, `max_depth=50`, dan `max_features='sqrt'`.

In [42]:
# Experiment 2: 100 trees, depth capped at 50, sqrt(n_features) candidate
# features per split.
#
# * min_impurity_split is not passed: the parameter was removed in
#   scikit-learn 1.0 (passing it raises TypeError) and omitting it is the same
#   as the old default of None.
# * Every other setting is left at the library default.
# * random_state is fixed so the reported metrics can be reproduced; numbers
#   from an earlier unseeded run may differ slightly.
clf2 = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    max_features='sqrt',
    random_state=0,
)
clf2 = clf2.fit(tfidf_train, y_train)
preds2 = clf2.predict(tfidf_test)

# NOTE: with average='weighted', recall is mathematically equal to accuracy
# (per-class recall weighted by class support), so the first and last lines
# always print the same value.
print('Accuracy: {}%'.format(accuracy_score(y_test, preds2) * 100))
print('Precision: {}%'.format(precision_score(y_test, preds2, average='weighted') * 100))
print('Recall: {}%'.format(recall_score(y_test, preds2, average='weighted') * 100))

Accuracy: 83.1875%
Precision: 83.19223838170983%
Recall: 83.1875%


Di bawah ini digunakan `n_estimators=200`, `max_depth=100`, dan `max_features='log2'`.

In [43]:
# Experiment 3: 200 trees, depth capped at 100, log2(n_features) candidate
# features per split, seeded for reproducibility.
#
# * min_impurity_split is not passed: the parameter was removed in
#   scikit-learn 1.0 (passing it raises TypeError) and omitting it is the same
#   as the old default of None.
# * Every other setting is left at the library default.
clf3 = RandomForestClassifier(
    n_estimators=200,
    max_depth=100,
    max_features='log2',
    random_state=0,
)
clf3 = clf3.fit(tfidf_train, y_train)
preds3 = clf3.predict(tfidf_test)

# NOTE: with average='weighted', recall is mathematically equal to accuracy
# (per-class recall weighted by class support), so the first and last lines
# always print the same value.
print('Accuracy: {}%'.format(accuracy_score(y_test, preds3) * 100))
print('Precision: {}%'.format(precision_score(y_test, preds3, average='weighted') * 100))
print('Recall: {}%'.format(recall_score(y_test, preds3, average='weighted') * 100))

Accuracy: 84.55%
Precision: 84.54979167082202%
Recall: 84.55%
