# Support Vector Machine using Linear Kernel

In [1]:
import os

import graphlab as gl

# Single place to change when the project folder moves; both CSVs live here.
DATA_DIR = "/Users/swastika.b/Documents/Swastika/SantaClaraEdu/courses/Machine Learning/Project/FinalCode"

# Labelled comments used below to fit the vectorizer, the TF-IDF weights and the classifier.
data = gl.SFrame.read_csv(os.path.join(DATA_DIR, "training_data.csv"), verbose=False)
# Comments used below for the "Evaluation on Test Data" section.
# NOTE(review): the evaluation set is read from "train.csv" -- confirm this is the intended held-out file.
test_data = gl.SFrame.read_csv(os.path.join(DATA_DIR, "train.csv"), verbose=False)

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1503109689.log


This non-commercial license of GraphLab Create for academic use is assigned to sbhat1@scu.edu and will expire on August 12, 2018.


# Preprocess Data

In [2]:
def transform_text(text):
    """Turn one raw comment into a list of stemmed, stop-word-free tokens.

    Processing order:
      1. delete ASCII punctuation and lower-case the text,
      2. split it with NLTK's ``word_tokenize``,
      3. drop non-ASCII bytes from every token,
      4. discard empty tokens and English stop words,
      5. Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw comment. It must be a Python 2 byte string, because the
        two-argument ``str.translate(None, chars)`` and ``str.decode`` are
        called on it and on its tokens.

    Returns
    -------
    list
        The stemmed tokens, in their original order.
    """
    import re
    import string
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer

    regex = re.compile('[%s]' % re.escape(string.punctuation))
    porter = PorterStemmer()
    # Load the stop-word list once per call and keep it in a set: calling
    # stopwords.words() for every token repeats the corpus lookup and then
    # scans the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))

    lower_case_text = text.translate(None, string.punctuation).lower()
    text2 = []
    for word in word_tokenize(lower_case_text):
        # Silently drop any byte outside ASCII (e.g. UTF-8 curly quotes).
        text0 = word.decode('ascii', 'ignore')
        # Punctuation was already removed before tokenising; this second pass
        # is retained from the original implementation.
        text1 = regex.sub(u'', text0)
        if text1 != u'' and text1 not in english_stopwords:
            text2.append(porter.stem(text1))
    return text2
    

# Tokenise, filter and stem every training comment; each row of
# 'comment_clean' holds the list returned by transform_text.
data['comment_clean'] = data['Comment'].apply(transform_text)

In [3]:
def create_text(words):
    """Join a sequence of tokens into a single space-separated string."""
    separator = ' '
    joined = separator.join(words)
    return joined

# Re-assemble each token list into one string, the input format the
# vectorizer is fitted on further down.
data['comment_string'] = data['comment_clean'].apply(create_text)

def remove_punctuation(text):
    """Delete every ASCII punctuation character from ``text`` and lower-case it.

    Parameters
    ----------
    text : str or unicode
        Text to clean.

    Returns
    -------
    str or unicode
        ``text`` without any character of ``string.punctuation``, lower-cased.
    """
    import re
    import string
    # A regex deletion gives the same result as the two-argument
    # str.translate(None, chars), but that form only exists on Python 2 byte
    # strings and raises TypeError for unicode input.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    return re.sub(punctuation_pattern, '', text).lower()
    
#data['comment_string'] = data['Comment'].apply(remove_punctuation)

# Train the Model

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Built once and reused: constructing a CountVectorizer and its analyzer for
# every single document is loop-invariant work.
_NGRAM_ANALYZER = CountVectorizer(
    ngram_range=(1, 8), token_pattern=r'\b\w+\b', min_df=1
).build_analyzer()


def split_into_lemmas(comments):
    """Return the word n-grams (1 to 8 words long) of one document.

    Despite the name, no lemmatisation happens here: the text is split on the
    ``\\b\\w+\\b`` token pattern and expanded into n-grams by scikit-learn's
    own analyzer.

    Parameters
    ----------
    comments : str
        A single document (one preprocessed comment string).

    Returns
    -------
    list
        The n-gram features extracted from the document.
    """
    return _NGRAM_ANALYZER(comments)

# split_into_lemmas is a callable analyzer, so CountVectorizer hands it each
# raw document and bypasses its own preprocessing and tokenising. stop_words
# and strip_accents would be ignored in that mode, so they are not passed.
vectorizer = CountVectorizer(analyzer=split_into_lemmas)
# fit_transform learns the vocabulary and builds the count matrix in one pass,
# instead of running the 1-to-8-gram analysis over the corpus twice.
text_transformed = vectorizer.fit_transform(data['comment_string'])
# Re-weight the raw counts by TF-IDF; the fitted transformer is reused on the
# evaluation and sample data below.
tfidf_transformer = TfidfTransformer()
tfidf_transformed_text = tfidf_transformer.fit_transform(text_transformed)

In [5]:
from sklearn.svm import LinearSVC
#classifier_nb = LinearSVC(random_state=0, penalty="l1", dual=False).fit(tfidf_transformed_text, data['Insult'])
# Fit a linear SVM on the TF-IDF features with the 'Insult' column as target.
# random_state is fixed so repeated runs are reproducible.
# NOTE: despite the "_nb" suffix this object is a LinearSVC.
classifier_nb = LinearSVC(random_state=0).fit(tfidf_transformed_text, data['Insult'])

# K-Fold Cross-Validation

In [6]:
# cross_val_predict lives in sklearn.model_selection in current scikit-learn;
# sklearn.cross_validation is the deprecated location and is kept only as a
# fallback for older installations.
try:
    from sklearn.model_selection import cross_val_predict
except ImportError:
    from sklearn.cross_validation import cross_val_predict
from sklearn import metrics
import numpy as np

# Target labels as a NumPy array.
y = (data['Insult']).to_numpy()
# Out-of-fold prediction for every training row using 10 folds.
# NOTE: the count/TF-IDF features were fitted on all rows before the folds are
# split, so each fold's vocabulary and idf weights include its held-out rows.
predicted = cross_val_predict(classifier_nb, tfidf_transformed_text, y, cv=10)
# Cross-validated accuracy over the whole training set.
metrics.accuracy_score(y, predicted)



0.7051454138702461

# Evaluation on Test Data

In [7]:
# Apply exactly the same preprocessing as for the training comments:
# token lists first, then space-joined strings.
test_data['comment_clean'] = test_data['Comment'].apply(transform_text)
test_data['comment_string'] = test_data['comment_clean'].apply(create_text)

In [8]:
# Only transform (never fit) here: the vocabulary and idf weights learnt on
# the training comments are reused for the evaluation comments.
test_text_transformed = vectorizer.transform(test_data['comment_string'])
tfidf_transformed_test_text = tfidf_transformer.transform(test_text_transformed)
# Predicted 'Insult' label for every evaluation comment.
predicted_class = classifier_nb.predict(tfidf_transformed_test_text)

In [9]:
import numpy as np
from sklearn.metrics import accuracy_score

# Ground-truth labels of the evaluation rows as a NumPy array.
true_class = np.array(test_data['Insult'])
# Share of evaluation comments whose predicted label matches the true one.
accuracy_score(true_class, predicted_class)

0.69014441347859135

# Test on Sample Data

In [10]:
# Take the single row at index 16 of the evaluation set as a worked example.
sample_test_data = test_data[16:17]
# Show its label, raw text and preprocessed text side by side.
sample_test_data.select_columns(['Insult','Comment','comment_string'])

Insult,Comment,comment_string
1,"""You with the 'racist' screen name\n\nYou are a ...",racist screen name pieceofshit ...


In [11]:
# Raw comment first; a parenthesised single argument prints the same text
# under Python 2's print statement.
print(sample_test_data['Comment'])
# Preprocessed form of the same comment, shown as the cell's result.
sample_test_data['comment_string']

['"You with the 'racist' screen name

You are a PieceOfShit.........."']


dtype: str
Rows: 1
['racist screen name pieceofshit']

In [12]:
# NOTE: this rebinds sample_test_data, which an earlier cell bound to a
# one-row slice of test_data, to a plain list of hand-written comments.
sample_test_data = ["You’re a moron, truth is beyond your reach", "I’ll take that temp...I really hate the heat"]
# Run the same preprocessing as for the SFrame columns: token lists first ...
tokenized_sample_data = [transform_text(text) for text in sample_test_data]
# ... then space-joined strings.
string_sample_data = [create_text(text) for text in tokenized_sample_data]
# Count features using the vocabulary learnt on the training comments.
sample_text_transformed = vectorizer.transform(string_sample_data)

In [13]:
#sample_text_transformed = vectorizer.transform(sample_test_data['comment_string'])
# Weight the sample counts with the idf values learnt on the training data.
tfidf_transformed_sample_text = tfidf_transformer.transform(sample_text_transformed)
# NOTE: this rebinds predicted_class, which an earlier cell used for the
# evaluation-set predictions.
predicted_class = classifier_nb.predict(tfidf_transformed_sample_text)
# Display the stored predictions instead of running predict a second time.
predicted_class

array([1, 1])