In [33]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize



def parsing(sentence, language='english'):
    """
    Tokenize a sentence and return it with every token replaced by its stem.

    The sentence is split into tokens with NLTK's ``word_tokenize`` and each
    token is stemmed with NLTK's ``SnowballStemmer``; the stems are joined
    back together with single spaces.

    :param sentence: text to stem
    :param language: language name passed to both the tokenizer and the
        stemmer. Defaults to 'english', which matches the previous
        hard-coded behaviour. The name must be accepted by both NLTK
        components.
    :return: one string containing the stemmed tokens separated by spaces
    """
    tokens = word_tokenize(sentence, language)
    stemmer = SnowballStemmer(language)
    # Stem token by token and rebuild a space-separated sentence.
    return " ".join(stemmer.stem(token) for token in tokens)



"""
SVM Linear Kernel
"""


def train(text,category):
    """
    Fit a linear-kernel SVM text classifier, tuning C by grid search.

    Every training text is first stemmed with ``parsing``. The stemmed texts
    go through a pipeline of word counts, TF-IDF weighting and an SVC, and
    ``GridSearchCV`` evaluates three values of C with 5-fold
    cross-validation scored by ROC AUC.

    :param text: iterable of raw training texts
    :param category: binary labels (0 or 1), one per text
    :return: the fitted GridSearchCV object wrapping the SVM pipeline
    """
    # Stem up front so the vectorizer only ever sees stemmed text.
    stemmed_docs = [parsing(doc) for doc in text]

    # Word counts -> TF-IDF weights -> linear SVM with balanced class weights.
    steps = [
        CountVectorizer(),
        TfidfTransformer(),
        SVC(probability=True, kernel="linear", class_weight="balanced"),
    ]
    svm_pipeline = make_pipeline(*steps)

    search_options = {
        "param_grid": {'svc__C': [0.01, 0.1, 1]},  # the three C values tried
        "cv": 5,               # number of cross-validation folds
        "scoring": "roc_auc",  # metric used to rank the candidates
        "verbose": 1,          # print progress (0 prints nothing)
        "n_jobs": -1,          # use all available processors
    }
    search = GridSearchCV(svm_pipeline, **search_options)

    search.fit(stemmed_docs, category)
    return search


def predict(trained_svm,text):
    """
    Stem the given texts and classify them with an already trained model.

    :param trained_svm: fitted model exposing a ``predict`` method
        (for example the object returned by ``train``)
    :param text: iterable of raw texts to classify
    :return: whatever ``trained_svm.predict`` returns for the stemmed texts
    """
    # Apply the same stemming step that is applied to the training texts.
    stemmed_docs = [parsing(doc) for doc in text]
    return trained_svm.predict(stemmed_docs)

def classification_result(test_category, prediction):
    """
    Print the classification report, then the accuracy score.

    :param test_category: true labels
    :param prediction: predicted labels, aligned with ``test_category``
    :return: None; both results are written to standard output
    """
    report = classification_report(test_category, prediction)
    print(report)
    accuracy = accuracy_score(test_category, prediction)
    print(accuracy)





In [19]:
# Exploration only: show the signature and docstring of word_tokenize.
# NOTE(review): leftover help() call; consider removing it from the final notebook.
help(word_tokenize)

Help on function word_tokenize in module nltk.tokenize:

word_tokenize(text, language='english', preserve_line=False)
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).
    
    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it.
    :type preserve_line: bool



In [28]:
# Sanity check: stem a short example sentence with parsing().
parsing("many dogs are running around")

'mani dog are run around'

In [34]:
# Load the pickled tweet dataset from the notebook's working directory.
# NOTE(review): this import belongs with the other imports in the first cell.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
import pickle
with open("obama-trump.pkl", "rb") as f:
    data = pickle.load(f)

In [30]:
# Number of top-level parts in the loaded dataset.
len(data)

4

In [21]:
# Show the first training text and how word_tokenize splits it.
# Note that the tokenizer separates '@' from the handle that follows it.
print(data[0][0])
words = word_tokenize(data[0][0], 'english')
print(words)

RT @PlaysTrumpCard: @realDonaldTrump We support you because we know your heart is in the right place, and greatly appreciate what an incred…
['RT', '@', 'PlaysTrumpCard', ':', '@', 'realDonaldTrump', 'We', 'support', 'you', 'because', 'we', 'know', 'your', 'heart', 'is', 'in', 'the', 'right', 'place', ',', 'and', 'greatly', 'appreciate', 'what', 'an', 'incred…']


In [26]:
# Quick check of the English Snowball stemmer on a single word.
stemmer = SnowballStemmer('english') # Language setting.
stemmer.stem("beauty")


'beauti'

In [4]:
# Sizes of the four dataset parts, in index order.
print(len(data[0]),len(data[1]), len(data[2]), len(data[3]))

2000 2000 1000 1000


In [10]:
# Peek at the first four texts of the third dataset part.
data[2][0:4]

('LIVE: President Obama is participating in a #SXSW2016 panel on civic engagement and technology. https://t.co/BBoIZW0LWP',
 'RT @marklevinshow: More on my opening \nhttps://t.co/nnp5r2hhU5',
 'Find out how @OFA organizers are gearing up to #ActOnClimate in their communities—and how you can support this work: https://t.co/OIa4ryAdw0',
 'This grassroots movement is aiming high for 2016. You can help by pitching in: https://t.co/5MGYREoabS https://t.co/FcRRnjrjPk')

In [35]:
# Unpack the dataset parts into train/test texts and their labels.
train_t = data[0]  # training texts
train_c = data[1]  # training labels, one per training text
test_t = data[2]   # test texts
test_c = data[3]   # test labels, one per test text

In [32]:
# NOTE(review): this displays the entire training tuple; a slice such as
# train_t[:5] would show a sample without flooding the output.
train_t

('RT @PlaysTrumpCard: @realDonaldTrump We support you because we know your heart is in the right place, and greatly appreciate what an incred…',
 "Don't let the facts get drowned out. Join the OFA Truth Team today: https://t.co/RgBRNLYdYl https://t.co/SN8HUXw2H4",
 "Judge Merrick Garland just received the American Bar Association's highest rating: https://t.co/9ew2WSpFqd",
 'Proud to cheer on Team USA at the Invictus Games today with my friend Joe. You represent the best of our country. https://t.co/WBzcltmgqj',
 'When someone shares their story, we see the world through their eyes. I’m looking forward to hearing a few from leaders around the world and sharing my own at the @ObamaFoundation Summit in Chicago. Tune in at https://t.co/GYkEOK8EuT. https://t.co/sOllDsDA1Z',
 'RT @WhiteHouse: "What I want the people of Louisiana to know is this—you’re not alone."—@POTUS: https://t.co/6Uet2HpPXI https://t.co/d5yjHs…',
 'RT @KatrinaPierson: What did Americans learn from tonight’s #DemDebate? 

In [36]:
# Fit the SVM pipeline (with grid search over C) on the training split.
trained = train(train_t, train_c)


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    9.7s finished


In [39]:
# Look at the first test text before predicting.
test_t[0]

'LIVE: President Obama is participating in a #SXSW2016 panel on civic engagement and technology. https://t.co/BBoIZW0LWP'

In [40]:
# Predict a label for every test text with the trained model.
# NOTE(review): this prints every prediction; a slice such as pred[:20]
# would keep the output short.
pred = predict(trained, test_t)
print(pred)


[0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 1 0 1
 0 1 0 0 1 0 1 0 1 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 0 0 1 0 0
 0 0 1 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 1 1 1
 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 1 1 1 1 1 1
 0 0 1 0 0 1 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 1 0 0 0
 1 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 0 1 0 0 0 1 0
 0 0 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0 1 0 1 0 0 1 0 1 1 1 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 1
 1 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 0 0
 0 1 1 1 0 0 0 0 1 1 0 1 1 1 1 1 0 0 1 1 1 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 1 0 1 0 0 0 1 1 1 1 0 0 1 0 0 1 1 1 1 1 1 1 0 0 1 0 0 0 1 1 0 0 1 1 0 1 1
 0 1 1 0 1 1 1 1 1 1 0 0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 1 0 0 0 0 1 1 0
 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0
 1 1 0 1 0 1 1 0 0 1 0 0 

In [16]:
# True test labels, for visual comparison with the predictions.
# NOTE(review): this prints every label; a slice would keep the output short.
print(test_c)

(0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 

In [41]:
# Compare the predictions against the true test labels.
classification_result(test_c,pred)


              precision    recall  f1-score   support

           0       0.93      0.94      0.93       500
           1       0.94      0.93      0.93       500

    accuracy                           0.93      1000
   macro avg       0.93      0.93      0.93      1000
weighted avg       0.93      0.93      0.93      1000

0.934
