In [8]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize




# Built once and reused by every call: parsing() runs once per document, so
# constructing a new SnowballStemmer for each sentence is repeated, avoidable work.
_ENGLISH_STEMMER = SnowballStemmer('english')  # Language setting.


def parsing(sentence):
    """
    Tokenize a sentence and return it with every token stemmed.
    Note that it only works for English: both the tokenizer and the stemmer
    are configured for 'english'.
    :param sentence: a sentence (string) to process
    :return: the Snowball-stemmed tokens joined back together with single spaces
    """
    tokens = word_tokenize(sentence, 'english')
    # Stem token by token and rebuild a whitespace-separated string so the
    # result can be fed to a text vectorizer like any other document.
    return " ".join(_ENGLISH_STEMMER.stem(token) for token in tokens)



"""
SVM Linear Kernel
"""


def train(text, category, *, c_values=(0.01, 0.1, 1), cv=5, n_jobs=-1,
          verbose=1, random_state=None):
    """
    Stem the training texts and fit a linear-kernel SVM, tuning C by grid search.
    Category should be binary: i.e., 0 or 1 (candidates are ranked with the
    "roc_auc" scorer).
    :param text: list of text
    :param category: list of binary category, aligned with ``text``
    :param c_values: candidate values of the SVC ``C`` parameter to try
    :param cv: cross-validation splitting strategy passed to GridSearchCV
               (an integer means that many folds)
    :param n_jobs: number of parallel jobs for the grid search (-1 = all processors)
    :param verbose: progress verbosity of the grid search (0 prints nothing)
    :param random_state: seed forwarded to SVC; it governs the shuffling used for
                         the probability estimates, so set it for repeatable runs
    :return: the fitted GridSearchCV object wrapping the
             CountVectorizer -> TfidfTransformer -> SVC pipeline
    """

    # Apply SnowballStemmer to each word. The result is still raw text (one
    # stemmed string per document); vectorization happens inside the pipeline.
    stemmed_text = [parsing(line) for line in text]

    # Set pipeline
    pipeline_svm = make_pipeline(
        CountVectorizer(),
        TfidfTransformer(),
        SVC(probability=True, kernel="linear", class_weight="balanced",
            random_state=random_state),
    )
    # Set grid search. make_pipeline names the SVC step "svc", hence the
    # "svc__C" key used to address its C parameter.
    grid_svm = GridSearchCV(pipeline_svm,
                            param_grid={'svc__C': list(c_values)},
                            cv=cv,
                            scoring="roc_auc",
                            verbose=verbose,
                            n_jobs=n_jobs)
    # Train
    grid_svm.fit(stemmed_text, category)
    return grid_svm


def predict(trained_svm, text):
    """
    Receive a trained svm model and text, and return the model's prediction.
    :param trained_svm: fitted model exposing ``predict`` (e.g. the object
                        returned by ``train``)
    :param text: list of text; a single string is treated as one document
    :return: whatever ``trained_svm.predict`` returns for the stemmed texts,
             one prediction per document
    """
    # A bare string would otherwise be iterated character by character and
    # silently produce one meaningless prediction per character.
    if isinstance(text, str):
        text = [text]

    # Apply the same stemming that was applied to the training texts.
    stemmed_text = [parsing(line) for line in text]
    # Get prediction using trained model
    return trained_svm.predict(stemmed_text)

def classification_result(test_category, prediction):
    """
    Print the classification report followed by the accuracy score.
    :param test_category: test category (true values) in list format
    :param prediction: predicted category in list format
    """
    # Each metric is computed and printed in turn: report first, accuracy second.
    for metric in (classification_report, accuracy_score):
        print(metric(test_category, prediction))





In [7]:
# Quick check of the stemming helper on a short sentence; the returned string
# is shown as the cell output.
parsing("many dogs are running around")

'mani dog are run around'

In [4]:
# Import data
# NOTE(review): pickle.load can execute arbitrary code embedded in the file it
# reads; only load "obama-trump.pkl" if it comes from a trusted source.
import pickle
with open("obama-trump.pkl", "rb") as f:
    # Deserialize the whole stored object into `data`.
    data = pickle.load(f)

In [5]:
# Number of top-level parts in the loaded object.
len(data)

4

In [9]:
# Sizes of the first four parts of the dataset, printed on one line.
print(*[len(data[i]) for i in range(4)])

2000 2000 1000 1000


In [13]:
# Name the four parts of the dataset by the role they play below:
# *_t is passed as the text argument, *_c as the category argument.
train_t = data[0]  # training texts
train_c = data[1]  # training categories
test_t = data[2]   # test texts
test_c = data[3]   # test categories (true values)

In [14]:
# SVM
# Fit the grid-searched SVM on the training texts and categories.
trained = train(train_t, train_c)
# Predict a category for every test text.
pred = predict(trained, test_t)
# Print the classification report and accuracy against the true test categories.
classification_result(test_c,pred)


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    6.7s finished


              precision    recall  f1-score   support

           0       0.93      0.94      0.93       500
           1       0.94      0.93      0.93       500

    accuracy                           0.93      1000
   macro avg       0.93      0.93      0.93      1000
weighted avg       0.93      0.93      0.93      1000

0.934
