In [None]:
import pandas as pd
import numpy as np
import nltk
#nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import string
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the three tab-separated, header-less review files.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a sentence are kept
# as ordinary text instead of being treated as field delimiters.
data_imdb = pd.read_csv('imdb_labelled.txt', sep='\t', header=None, quoting=3)
data_yelp = pd.read_csv('yelp_labelled.txt', sep='\t', header=None, quoting=3)
data_amazon = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None, quoting=3)

# For every source, column 0 becomes the text (x) and column 1 the label (y).
data_imdb_x, data_imdb_y = data_imdb.iloc[:, 0], data_imdb.iloc[:, 1]
data_yelp_x, data_yelp_y = data_yelp.iloc[:, 0], data_yelp.iloc[:, 1]
data_amazon_x, data_amazon_y = data_amazon.iloc[:, 0], data_amazon.iloc[:, 1]

In [None]:
# Sanity check: show the (rows, columns) shape of each raw dataset,
# in the order imdb, yelp, amazon.
for _frame in (data_imdb, data_yelp, data_amazon):
    print(_frame.shape)

In [None]:
# function for text preprocess
def textPreprocess(inputText):
    """Tokenize, stem and clean every sentence in ``inputText``.

    Each sentence goes through the following steps, in this order:
      1. ``word_tokenize`` splits it into tokens,
      2. every token is stemmed with the Porter stemmer,
      3. tokens found in NLTK's English stop-word list are dropped,
      4. all ``string.punctuation`` characters are deleted from each token,
      5. tokens that ended up empty are dropped.

    Parameters
    ----------
    inputText : iterable of str
        The sentences to process, e.g. a pandas Series or a plain list.
        Items are visited in iteration order, so a Series whose index is
        not 0..n-1 (for instance after filtering) is handled as well.

    Returns
    -------
    list of list of str
        One list of cleaned tokens per input sentence, in input order.
    """
    stop_words = set(stopwords.words('english'))
    ps = nltk.stem.PorterStemmer()
    # Loop-invariant: translation table that deletes punctuation characters.
    table = str.maketrans('', '', string.punctuation)

    procText = []
    # Iterate over the values themselves instead of using ``inputText[i]``:
    # on a Series that is a label lookup, which raises KeyError as soon as
    # the index is no longer the default 0..n-1 range.
    for text in inputText:
        stems = [ps.stem(token) for token in word_tokenize(text)]
        # NOTE(review): the stop-word filter runs on the *stemmed* tokens, so
        # a stop word is only dropped when its stem is itself in the list.
        # The order is kept as-is to leave the feature set unchanged --
        # confirm whether filtering before stemming was intended.
        kept = [w for w in stems if w not in stop_words]
        stripped = [w.translate(table) for w in kept]
        # Tokens made only of punctuation are now empty strings; discard them.
        procText.append([w for w in stripped if w])
    return procText


# Apply the shared preprocessing to the text column of each dataset
# (evaluated in the order imdb, yelp, amazon).
data_imdb_x_proc, data_yelp_x_proc, data_amazon_x_proc = [
    textPreprocess(column)
    for column in (data_imdb_x, data_yelp_x, data_amazon_x)
]



In [None]:
# Merge the three corpora. Texts and labels are both joined in the order
# imdb -> yelp -> amazon, so entry i of data_x pairs with entry i of data_y.
data_x = [*data_imdb_x_proc, *data_yelp_x_proc, *data_amazon_x_proc]
data_y = np.concatenate([data_imdb_y.values, data_yelp_y.values, data_amazon_y.values])


In [None]:
#find unique words and sort them
def uniqueSortedElement(inputList):
    """Return the sorted list of distinct items found across all sub-lists.

    Parameters
    ----------
    inputList : iterable of iterables
        For example a list of token lists. Items must be hashable and
        mutually comparable so they can be de-duplicated and sorted.

    Returns
    -------
    list
        Every distinct item, in ascending ``sorted`` order. An empty input
        (or one containing only empty sub-lists) yields ``[]``.
    """
    # A single set comprehension keeps the work linear in the total number
    # of items; growing a list with ``list + list`` inside a loop re-copies
    # the accumulated list on every iteration.
    return sorted({item for sublist in inputList for item in sublist})


In [None]:
# Build the feature vocabulary: every distinct processed token across the
# combined corpus, in sorted order. Each entry becomes one feature column.
voc=uniqueSortedElement(data_x)
# Displayed as the cell output: the number of vocabulary entries.
len(voc)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


def final_vector(dataset,vocab):
    """Turn tokenized sentences into a dense bag-of-words count matrix.

    Parameters
    ----------
    dataset : sequence of list of str
        One token list per sentence. The tokens of a sentence are joined
        with single spaces before being handed to ``CountVectorizer``.
    vocab : sequence of str
        The fixed vocabulary (unique terms). Column ``j`` of the result
        counts occurrences of ``vocab[j]``.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(dataset), len(vocab))``; row ``i``
        holds the unigram counts of ``dataset[i]``.

    Notes
    -----
    ``CountVectorizer`` re-tokenizes the joined string with its default
    settings (see the scikit-learn documentation for ``token_pattern``),
    so tokens it does not recognise are not counted.
    """
    # The matrix width comes from the ``vocab`` argument, so the function
    # does not depend on any notebook-level variable.
    if len(dataset) == 0:
        return np.zeros([0, len(vocab)])

    # dtype=float64 keeps the result type of a float zero matrix filled with counts.
    vectorizer = CountVectorizer(vocabulary=vocab, ngram_range=(1, 1), dtype=np.float64)
    # Every sentence is vectorized, including the one at index 0, in a single
    # batch call instead of one ``transform`` call per row.
    documents = [" ".join(tokens) for tokens in dataset]
    return vectorizer.transform(documents).toarray()


# Vectorize the combined corpus against the vocabulary.
data_vector = final_vector(data_x,voc)
# Displayed as the cell output: (number of sentences, vocabulary size).
data_vector.shape

In [None]:
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

# Hold out 20% of the samples as a test set. The split is shuffled with a
# fixed seed (reproducible) and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    data_vector,
    data_y,
    test_size=0.2,
    random_state=2019,
    shuffle=True,
    stratify=data_y,
)

# Optional min-max scaling fitted on the training split only (disabled):
#scaler = MinMaxScaler().fit(X_train)
#X_train = scaler.transform(X_train)
#X_test= scaler.transform(X_test)

In [None]:
# Grid search over C and gamma for an RBF-kernel SVM, scored by accuracy.
# Wider alternative grid (disabled): C = 2**-3 .. 2**15, gamma = 2**-15 .. 2**3.

# 10-fold stratified cross-validation. The splitter object itself is passed
# to GridSearchCV rather than the generator returned by ``.split(...)``: a
# generator is exhausted after the first fit, so fitting ``clf`` again would
# find no folds. The folds produced for (X_train, y_train) are the same.
cv = StratifiedKFold(n_splits=10)

# Powers of two: C in 2**-3 .. 2**5, gamma in 2**-3 .. 2**3.
Cs = [2**k for k in range(-3, 6)]
gammas = [2**k for k in range(-3, 4)]

parameters = {'C': Cs, 'gamma': gammas}
svc = svm.SVC(kernel='rbf', decision_function_shape='ovo')
clf = GridSearchCV(svc, parameters, cv=cv, scoring='accuracy', verbose=2, n_jobs=-1)
clf.fit(X_train, y_train)

In [None]:
# Persist the grid-search results to disk.

# 1) The raw ``cv_results_`` dictionary, printed as-is.
with open('out3.txt', 'w') as raw_out:
    print(clf.cv_results_, file=raw_out)

# 2) One line per parameter combination: mean CV score, "+/-" twice the
#    standard deviation across folds, and the parameter dictionary.
# NOTE(review): the ".txt3" extension looks like a typo for "...3.txt";
# the file name is kept unchanged in case something else reads it -- confirm.
results = clf.cv_results_
with open('out_values_per_pair.txt3', 'w') as summary_out:
    rows = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for mean, std, params in rows:
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params), file=summary_out)

In [None]:
# Report the best parameter combination found by the grid search and its score.
print("The best parameters are %s with a score of %f" % (clf.best_params_, clf.best_score_))

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Train the final RBF SVM on the training split and evaluate it on the
# held-out test split.
# NOTE(review): C and gamma are hard-coded here (both values lie inside the
# searched grid) instead of being read from clf.best_params_ -- confirm they
# still match the grid-search result whenever the features change.
# Previously tried alternative: svm.SVC( kernel='rbf',C=1000,gamma=0.0001)
svc2 = svm.SVC(kernel='rbf', C=2, gamma=0.125)
svc2.fit(X_train, y_train)
y_hat = svc2.predict(X_test)

# Overall accuracy followed by the per-class precision/recall/F1 report.
acc = accuracy_score(y_test, y_hat)
print("SVM accuracy: %f" % (acc))
print(classification_report(y_test, y_hat))

# Number of support vectors per class, and their total.
support_counts = svc2.n_support_
print("num of vectors per class", support_counts)
print("sum of vectors: ", np.sum(support_counts))