In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the full dataset from a CSV in the working directory.
# Later cells read its `text` and `label` columns (via the train/test splits).
data = pd.read_csv('data.csv')

In [3]:
# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# shuffle deterministic, so a kernel restart reproduces the same split (and
# therefore comparable metrics) instead of drawing a new one every run.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

In [4]:
def create_pipeline(classifier):
    """Build a two-step text classification pipeline around `classifier`.

    Parameters
    ----------
    classifier : estimator
        Used as the final step of the pipeline, after vectorization.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline whose steps are named 'bow' and 'classifier'.
    """
    # Word-level bigrams and trigrams only (ngram_range starts at 2, so no
    # single words), with the built-in English stop word list applied.
    bag_of_words = CountVectorizer(analyzer='word', stop_words='english', ngram_range=(2, 3))
    # `steps` is documented as a *list* of (name, estimator) pairs; pass a
    # list rather than a tuple so the pipeline holds a mutable sequence.
    return Pipeline([
        ('bow', bag_of_words),
        ('classifier', classifier),
    ])

In [5]:
def train(name, pipeline):
    """Fit `pipeline` on the training split, printing a line before and after.

    Parameters
    ----------
    name : str
        Label interpolated into the start message.
    pipeline : object
        Anything exposing `fit(X, y)`; it is fitted in place on the
        module-level `train_data` (`text` as input, `label` as target).
    """
    start_message = 'training with %s...' % name
    print(start_message)

    documents, targets = train_data.text, train_data.label
    pipeline.fit(documents, targets)

    print('finished')

In [6]:
def evaluate(name, pipeline):
    """Print specificity and precision of a fitted `pipeline` on the test split.

    Predictions for the module-level `test_data.text` are compared with
    `test_data.label` through a confusion matrix, from which two ratios are
    printed:

    * specificity = TN / (TN + FP)
    * precision   = TP / (TP + FP)

    The class at index 1 of the confusion matrix is treated as the positive
    class and the class at index 0 as the negative class.

    Parameters
    ----------
    name : str
        Label interpolated into the start message.
    pipeline : object
        Fitted model exposing `predict`. If it also exposes `classes_`, those
        labels fix the row/column order of the confusion matrix.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the problem is not binary.
    """
    print('evaluating %s...' % name)

    prediction = pipeline.predict(test_data.text)

    # Use the classes seen during fitting (when available) so the matrix keeps
    # the same layout even if the test split happens to miss one of them;
    # labels=None falls back to the labels present in the data.
    labels = getattr(pipeline, 'classes_', None)
    cm = confusion_matrix(test_data.label, prediction, labels=labels)

    # The TN/FP/TP positions below are only meaningful for a binary problem.
    if cm.shape != (2, 2):
        raise ValueError(
            'evaluate() expects a binary problem (2x2 confusion matrix), got shape %s'
            % (cm.shape,)
        )

    # Rows are true labels, columns are predicted labels.
    TN = cm[0, 0]
    FP = cm[0, 1]
    TP = cm[1, 1]

    def ratio(numerator, denominator):
        # An empty denominator means the metric is undefined: report NaN
        # explicitly instead of relying on a division-by-zero warning.
        return numerator / denominator if denominator else float('nan')

    # NOTE: the 'specifity' spelling is kept as-is so the printed output
    # stays identical to earlier runs of this notebook.
    print('specifity', ratio(TN, TN + FP))
    print('precision', ratio(TP, TP + FP))
    print('finished')

In [7]:
def execute(name, classifier):
    """Run the full experiment for one classifier: build, train, evaluate.

    Parameters
    ----------
    name : str
        Label passed through to the progress messages of each stage.
    classifier : estimator
        Wrapped by `create_pipeline` into the model that is trained and
        then evaluated.
    """
    model = create_pipeline(classifier)
    # Both stages share the (name, pipeline) signature and must run in order.
    for stage in (train, evaluate):
        stage(name, model)

In [8]:
# Train and evaluate a multinomial naive Bayes classifier with default hyperparameters.
execute('multinomial_naive_bayes', MultinomialNB())

training with multinomial_naive_bayes...
finished
evaluating multinomial_naive_bayes...
specifity 0.9551569506726457
precision 0.9812382739212008
finished


In [9]:
# Train and evaluate a linear support vector classifier with default hyperparameters.
execute('svm', LinearSVC())

training with svm...
finished
evaluating svm...
specifity 0.9641255605381166
precision 0.9832635983263598
finished


In [10]:
# Train and evaluate a random forest. Forest construction is randomized, so a
# fixed random_state is passed to get the same model on every run given the
# same training data.
execute('random_forest', RandomForestClassifier(random_state=42))

training with random_forest...
finished
evaluating random_forest...
specifity 0.9775784753363229
precision 0.9891067538126361
finished


In [11]:
# Train and evaluate a single decision tree. A fixed random_state keeps the
# fitted tree the same on every run given the same training data.
execute('decision_tree', DecisionTreeClassifier(random_state=42))

training with decision_tree...
finished
evaluating decision_tree...
specifity 0.9551569506726457
precision 0.9801980198019802
finished
