### LSTM

In [13]:
from google.colab import files

# Ask for a file upload; the result is indexed by file name below.
uploaded = files.upload()

# Report the name and size of every uploaded file.
report = 'User uploaded file "{name}" with length {length} bytes'
for fn, payload in uploaded.items():
  print(report.format(name=fn, length=len(payload)))

Saving contentTranslated.csv to contentTranslated.csv
User uploaded file "contentTranslated.csv" with length 626868 bytes


In [0]:
!pip install -q keras
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [0]:
data = pd.read_csv('contentTranslated.csv')


In [15]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentiment,text,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,0.0,"Section 499 of the Indian Penal Code, 1860 is ...",,,,
1,,1.0,The fact is that the police crews in the vehic...,,,,
2,,0.0,No responsibility for the country or the people.,,,,
3,,0.0,Ambedkar said that the son of a hired son was ...,,,,
4,,0.0,Everyone should recognize that terrorism that ...,,,,


In [0]:
# Keep only the model input and the label. The explicit copy makes `data` an
# independent frame, so assigning to its columns in the following cell does
# not raise pandas' SettingWithCopyWarning.
data = data[['text', 'sentiment']].copy()

In [0]:
# Normalise the text: lower-case it, then delete every character that is not
# a letter, a digit or whitespace.
# The pattern is a raw string and uses the range `A-Z`; the previous `A-z`
# range also covered the ASCII punctuation that sits between 'Z' and 'a'
# ([ \ ] ^ _ and the backtick), so those characters were never removed.
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# NOTE: the former `for idx, row in data.iterrows(): row[0] = ...` loop was
# dropped. iterrows() hands out row copies for a frame with mixed column
# types, so writing to `row` never changed `data`.

# Vocabulary cap for the tokenizer (name kept as-is: the model cell reads it).
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
# Integer-encode every document and pad all sequences to a common length.
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [23]:
from keras import metrics

embed_dim = 128
lstm_out = 196

# Embedding -> spatial dropout -> LSTM -> 2-way softmax, declared as one
# layer list instead of successive add() calls.
model = Sequential([
    Embedding(max_fatures, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'mae'],
)
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 61, 128)           256000    
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 61, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
# One-hot encode the labels (one column per distinct sentiment value), then
# hold out 33% of the rows with a fixed seed.
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

(3624, 61) (3624, 2)
(1786, 61) (1786, 2)


In [25]:
# Mini-batch size; the evaluation cell further down reuses this value.
batch_size = 32
# Train on the training split only; verbose=2 logs one line per epoch.
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)



Instructions for updating:
Use tf.cast instead.
Epoch 1/7
 - 19s - loss: 0.5905 - acc: 0.7225 - mean_absolute_error: 0.4036
Epoch 2/7
 - 16s - loss: 0.5090 - acc: 0.7639 - mean_absolute_error: 0.3438
Epoch 3/7
 - 16s - loss: 0.4310 - acc: 0.8100 - mean_absolute_error: 0.2792
Epoch 4/7
 - 16s - loss: 0.3618 - acc: 0.8486 - mean_absolute_error: 0.2339
Epoch 5/7
 - 16s - loss: 0.3014 - acc: 0.8710 - mean_absolute_error: 0.1926
Epoch 6/7
 - 16s - loss: 0.2603 - acc: 0.8967 - mean_absolute_error: 0.1645
Epoch 7/7
 - 16s - loss: 0.2164 - acc: 0.9135 - mean_absolute_error: 0.1373


<keras.callbacks.History at 0x7fb268064b00>

In [0]:
validation_size = 1500

# Carve the last `validation_size` rows off the held-out split; only the
# remaining leading rows are scored by evaluate() below.
# NOTE: this rebinds X_test / Y_test, so running the cell a second time
# slices the already-shortened arrays again.
X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

# The model is compiled with two metrics ('accuracy' and 'mae'), so
# evaluate() returns [loss, accuracy, mae]. Unpacking that list into two
# names raised ValueError; keep only the first two entries.
score, acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)[:2]
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.74
acc: 0.74


In [0]:
# Per-class accuracy on the validation slice. Rows whose one-hot label has
# its maximum at index 0 are counted as "neg", all others as "pos".
# Every row is predicted in a single batched call instead of one predict()
# call per row.
true_labels = np.argmax(Y_validate, axis=1)
pred_labels = np.argmax(model.predict(X_validate, verbose=2), axis=1)

is_neg = true_labels == 0
hits = pred_labels == true_labels

neg_cnt, pos_cnt = int(is_neg.sum()), int((~is_neg).sum())
neg_correct = int((hits & is_neg).sum())
pos_correct = int((hits & ~is_neg).sum())

# Report nan instead of raising ZeroDivisionError when a class is absent.
pos_acc = pos_correct / pos_cnt * 100 if pos_cnt else float('nan')
neg_acc = neg_correct / neg_cnt * 100 if neg_cnt else float('nan')
print("pos_acc", pos_acc, "%")
print("neg_acc", neg_acc, "%")

pos_acc 34.229828850855746 %
neg_acc 85.51787351054078 %


### LSTM with categorical cross-entropy loss

In [0]:
!pip install -q keras
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [0]:
data = pd.read_csv('contentTranslated.csv')


In [0]:
data.head()

In [0]:
# Keep only the model input and the label. The explicit copy makes `data` an
# independent frame, so assigning to its columns in the following cell does
# not raise pandas' SettingWithCopyWarning.
data = data[['text', 'sentiment']].copy()

In [0]:
# Normalise the text: lower-case it, then delete every character that is not
# a letter, a digit or whitespace.
# The pattern is a raw string and uses the range `A-Z`; the previous `A-z`
# range also covered the ASCII punctuation that sits between 'Z' and 'a'
# ([ \ ] ^ _ and the backtick), so those characters were never removed.
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# NOTE: the former `for idx, row in data.iterrows(): row[0] = ...` loop was
# dropped. iterrows() hands out row copies for a frame with mixed column
# types, so writing to `row` never changed `data`.

# Vocabulary cap for the tokenizer (name kept as-is: the model cell reads it).
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
# Integer-encode every document and pad all sequences to a common length.
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [0]:
embed_dim = 128
lstm_out = 196

# Same architecture as the first LSTM run; only the loss passed to compile()
# differs. Layers are declared as one list instead of successive add() calls.
model = Sequential([
    Embedding(max_fatures, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 61, 128)           256000    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 61, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
# One-hot encode the labels (one column per distinct sentiment value), then
# hold out 33% of the rows with a fixed seed.
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

(3624, 61) (3624, 2)
(1786, 61) (1786, 2)


In [0]:
# Mini-batch size; the evaluation cell further down reuses this value.
batch_size = 32
# Train on the training split only; verbose=2 logs one line per epoch.
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)



Epoch 1/7
 - 18s - loss: 0.5926 - acc: 0.7210
Epoch 2/7
 - 17s - loss: 0.5101 - acc: 0.7566
Epoch 3/7
 - 17s - loss: 0.4238 - acc: 0.8157
Epoch 4/7
 - 17s - loss: 0.3553 - acc: 0.8471
Epoch 5/7
 - 17s - loss: 0.2974 - acc: 0.8778
Epoch 6/7
 - 17s - loss: 0.2517 - acc: 0.8962
Epoch 7/7
 - 17s - loss: 0.2104 - acc: 0.9183


<keras.callbacks.History at 0x7f23fb725550>

In [0]:
validation_size = 1500

# Split the held-out data in two: the trailing `validation_size` rows become
# the validation slice, the leading remainder is what evaluate() scores.
X_test, X_validate = X_test[:-validation_size], X_test[-validation_size:]
Y_test, Y_validate = Y_test[:-validation_size], Y_test[-validation_size:]

score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print("score: %.2f" % score)
print("acc: %.2f" % acc)

score: 0.83
acc: 0.71


In [0]:
# Per-class accuracy on the validation slice. Rows whose one-hot label has
# its maximum at index 0 are counted as "neg", all others as "pos".
# Every row is predicted in a single batched call instead of one predict()
# call per row.
true_labels = np.argmax(Y_validate, axis=1)
pred_labels = np.argmax(model.predict(X_validate, verbose=2), axis=1)

is_neg = true_labels == 0
hits = pred_labels == true_labels

neg_cnt, pos_cnt = int(is_neg.sum()), int((~is_neg).sum())
neg_correct = int((hits & is_neg).sum())
pos_correct = int((hits & ~is_neg).sum())

# Report nan instead of raising ZeroDivisionError when a class is absent.
pos_acc = pos_correct / pos_cnt * 100 if pos_cnt else float('nan')
neg_acc = neg_correct / neg_cnt * 100 if neg_cnt else float('nan')
print("pos_acc", pos_acc, "%")
print("neg_acc", neg_acc, "%")

pos_acc 39.119804400978 %
neg_acc 81.11824014665444 %


### GRU

In [0]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('contentTranslated.csv')
df.head()

In [None]:
!pip install -q nltk
import nltk
# NOTE(review): 'all' downloads every NLTK resource; this notebook appears to
# need only the tokenizer models and the stop-word list. Consider narrowing.
nltk.download('all')
from nltk.tokenize import word_tokenize

# Join every document into one long string. The documents live in the 'text'
# column (see the data.head() output near the top of the notebook); the frame
# has no 'review' column, so `df.review` raised AttributeError.
reviews = df['text'].str.cat(sep=' ')

# Split the combined text into word tokens.
tokens = word_tokenize(reviews)

vocabulary = set(tokens)
print(len(vocabulary))

# Token -> occurrence count; the cell displays the 50 most frequent tokens.
frequency_dist = nltk.FreqDist(tokens)
sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)[0:50]

In [None]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
# The NLTK stop-word list is lower-case while the tokens keep their original
# casing, so compare case-insensitively.
tokens = [w for w in tokens if w.lower() not in stop_words]
# Rebuild the frequency table from the filtered tokens; without this the
# filter above has no effect on anything derived from `frequency_dist`
# (the word cloud in the next cell).
frequency_dist = nltk.FreqDist(tokens)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build the cloud from the token frequency table and draw it without axes.
wordcloud = WordCloud(width=1600, height=800, max_font_size=200)
wordcloud.generate_from_frequencies(frequency_dist)

fig, ax = plt.subplots(figsize=(16, 10))
ax.imshow(wordcloud)
ax.axis("off")
plt.show()

In [None]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
lb_model = lb.fit_transform([-1, 0 , 1])

# Labels come from column 1 and documents from column 2 of the frame.
# Missing labels are filled with 0 and cast to int, matching how the SVM
# section of this notebook treats them. The previous mean imputation used
# `sklearn.preprocessing.Imputer` (removed from recent scikit-learn) and
# produced fractional values for what are class labels.
y = df.iloc[:, 1].fillna(0).astype(int).values
x = df.iloc[:, 2].values

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=.5, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
print(train_vectors.shape, test_vectors.shape)

In [None]:
y_train =y_train.astype('int') 


In [None]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(train_vectors, y_train)

In [None]:
from  sklearn.metrics  import accuracy_score

predicted = clf.predict(test_vectors)

#print(accuracy_score(y_test,predicted))

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [None]:
tokenizer_obj = Tokenizer()
# X_train and X_test are NumPy arrays, so `+` adds them element-wise (gluing
# the i-th training string to the i-th test string) instead of appending one
# to the other. Concatenate so every document appears exactly once.
total_reviews = np.concatenate([X_train, X_test])
tokenizer_obj.fit_on_texts(total_reviews)

In [None]:
max_length = max([len(s.split()) for s in total_reviews])
vocab_size = len(tokenizer_obj.word_index) + 1

In [None]:
X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

In [None]:
X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')


In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

EMBEDDING_DIM = 100 

model = Sequential()


In [None]:
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32, dropout=.2, recurrent_dropout=.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
model.summary()

In [None]:
print('Train')
model.fit(X_train_pad,y_train, batch_size=48, epochs=2, validation_data=(X_test_pad,y_test),verbose=2)

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Accumulator for the cleaned token lists, filled by the loop in the next cell.
review_lines = []
# The documents are stored in the 'text' column (see the data.head() output
# near the top of the notebook); the frame has no 'review' column.
lines = df['text'].values.tolist()

In [None]:
# Neither of these depends on the line being processed, so build them once
# instead of once per document.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

# Per document: tokenize, lower-case, strip punctuation, then keep purely
# alphabetic tokens that are not English stop words.
# NOTE: this appends to `review_lines`; re-run the cell that creates the list
# before re-running this one, or every document is added twice.
for line in lines:
    tokens = [w.lower() for w in word_tokenize(line)]
    stripped = [w.translate(table) for w in tokens]
    words = [w for w in stripped if w.isalpha() and w not in stop_words]
    review_lines.append(words)
    

In [None]:
len(review_lines)

In [None]:
import gensim
# Train word vectors on the cleaned token lists.
# NOTE: this rebinds `model`, which previously held the Keras GRU network;
# from here until the next `model = Sequential()` the name refers to the
# Word2Vec model.
# NOTE(review): the `size=` keyword and `wv.vocab` look like the pre-4.0
# gensim API — confirm the installed version.
model = gensim.models.Word2Vec(sentences=review_lines, size=EMBEDDING_DIM, window=5, workers=4, min_count=1)
words = list(model.wv.vocab)
print('vocabulary size: %d' % len(words))

In [None]:
model.wv.most_similar('responsibility')

In [None]:
model.wv.most_similar_cosmul(positive=['fact', 'son'], negative=['tragedy'])

In [None]:
print(model.wv.doesnt_match("woman king queen movie".split()))

In [None]:
# Integer-encode the cleaned token lists with a fresh tokenizer.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(review_lines)
sequences = tokenizer_obj.texts_to_sequences(review_lines)

word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

review_pad = pad_sequences(sequences, maxlen= max_length)
# Missing labels are filled with 0, as the other sections of this notebook
# do; a NaN target would make the training loss NaN.
sentiment = df['sentiment'].fillna(0).values
print('Shape of review tensor:', review_pad.shape)
print('Shape of sentiment tensor:', sentiment.shape)


In [None]:
import numpy as np
# Row i of the matrix holds the vector of the word whose tokenizer index is
# i; row 0 stays all-zero because tokenizer indices start at 1.
num_words = len(word_index) + 1
embedding_matrix  = np.zeros((num_words, EMBEDDING_DIM))

# `embeddings_index` was never defined in this notebook, so the lookup below
# raised NameError. Read the vectors directly from the Word2Vec model trained
# above, which is what `model` is bound to at this point.
for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1; the old `>` test let i == num_words through.
    if i >= num_words:
        continue
    # Words unknown to Word2Vec keep their all-zero row.
    if word in model.wv:
        embedding_matrix[i] = model.wv[word]

In [None]:
print(num_words)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

# Frozen embedding layer initialised from the precomputed matrix.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_length,
    trainable=False,
)

# Embedding -> GRU -> single sigmoid unit, declared as one layer list.
model = Sequential([
    embedding_layer,
    GRU(units=32, dropout=.2, recurrent_dropout=.2),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
VALIDATION_SPLIT = .2

# Shuffle documents and labels with the same permutation. The generator is
# seeded so the train/validation partition is the same on every run, like
# the `random_state` splits used elsewhere in this notebook.
indices = np.random.RandomState(42).permutation(review_pad.shape[0])
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_validation_samples = int(VALIDATION_SPLIT * review_pad.shape[0])

# The trailing `num_validation_samples` rows are held out; the rest is training data.
X_train_pad = review_pad[:-num_validation_samples]
y_train = sentiment[:-num_validation_samples]
X_test_pad = review_pad[-num_validation_samples:]
y_test = sentiment[-num_validation_samples:]


In [None]:
print('Shape of X_train_pad tensor:', X_train_pad.shape)
print('Shape of y_train tensor:', y_train.shape)

print('Shape of X_test_pad_tensor:', X_test_pad.shape)
print('Shape of y_test tensor:', y_test.shape)

In [None]:
print('Train.....')

model.fit(X_train_pad, y_train, batch_size=30, epochs=3, validation_data=(X_test_pad, y_test), verbose=2)

### SVM

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
data = pd.read_csv('contentTranslated.csv')
data.head()

In [None]:
# The CSV's document and label columns are 'text' and 'sentiment' (see the
# data.head() output near the top of the notebook); it has no 'message' or
# 'class' column.
x = data['text']
# Missing labels become 0; labels are then cast to int.
y = data['sentiment'].fillna(0).astype(int)

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=.2, random_state=1)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

In [None]:
# Built once and shared by every tokenize() call.
_tweet_tokenizer = TweetTokenizer()

def tokenize(text):
    """Split ``text`` into tokens with NLTK's TweetTokenizer."""
    return _tweet_tokenizer.tokenize(text)

# stem() reads these two module-level helpers. They were never created
# before, so calling stem() raised NameError.
stemmer = SnowballStemmer('english')
analyzer = CountVectorizer().build_analyzer()

def stem(doc):
    """Return a generator over the Snowball stems of the tokens that the
    default CountVectorizer analyzer extracts from ``doc``."""
    return (stemmer.stem(w) for w in analyzer(doc))

en_stopwords = set(stopwords.words("english"))

# CountVectorizer documents `stop_words` as 'english', a list or None, so
# hand it a (sorted, hence deterministic) list rather than the set.
vectorizer = CountVectorizer(analyzer = 'word', tokenizer = tokenize, lowercase = True,
                             ngram_range=(1, 1), stop_words = sorted(en_stopwords))

In [None]:
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [None]:
np.random.seed(1)

pipeline_svm = make_pipeline(vectorizer, SVC(probability=True, kernel="linear", class_weight="balanced"))

grid_svm = GridSearchCV(pipeline_svm,
                        param_grid = {'svc__C': [0.01, 0.1, 1]}, 
                        cv = kfolds,
                        scoring="roc_auc",
                        verbose=1,   
                        n_jobs=-1) 

grid_svm.fit(x_train, y_train)
grid_svm.score(x_test, y_test)

In [None]:
# A cell only displays its last expression, so show both values as one tuple;
# written on separate lines, best_params_ was never displayed.
grid_svm.best_params_, grid_svm.best_score_

In [None]:
def report_results(model, x, y):
    """Score ``model`` on ``x`` against the true labels ``y``.

    Returns a dict with the keys 'auc', 'f1', 'acc', 'precision' and
    'recall'. AUC is computed from column 1 of ``predict_proba``; the other
    metrics use the hard predictions from ``predict``.
    """
    positive_scores = model.predict_proba(x)[:, 1]
    predicted = model.predict(x)

    return {
        'auc': roc_auc_score(y, positive_scores),
        'f1': f1_score(y, predicted),
        'acc': accuracy_score(y, predicted),
        'precision': precision_score(y, predicted),
        'recall': recall_score(y, predicted),
    }



In [None]:
report_results(grid_svm.best_estimator_, x_test, y_test)

In [None]:
def get_roc_curve(model, x, y):
    """Return ``(fpr, tpr)`` for ``model`` on ``x``, using column 1 of
    ``predict_proba`` as the score; the thresholds are discarded."""
    positive_scores = model.predict_proba(x)[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y, positive_scores)
    return false_pos_rate, true_pos_rate

In [None]:
roc_svm = get_roc_curve(grid_svm.best_estimator_, x_test, y_test)

In [None]:
fpr, tpr = roc_svm
plt.figure(figsize=(14,8))
plt.plot(fpr, tpr, color="red")
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Roc curve')
plt.show()


In [None]:
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(grid_svm.best_estimator_, x_train, y_train, cv=5, n_jobs=-1, 
                                                        scoring="roc_auc", train_sizes=np.linspace(.1, 1.0, 10), random_state=1)

In [None]:
def plot_learning_curve(x, y, train_sizes, train_scores, test_scores, title='', ylim=None, figsize=(14,8)):
    """Plot mean training and cross-validation scores against training-set size.

    Each curve is drawn with a shaded band of one standard deviation
    (computed along axis 1 of the score arrays). ``x`` and ``y`` are accepted
    but not used. Returns the ``matplotlib.pyplot`` module.
    """
    plt.figure(figsize=figsize)
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.grid()

    # (mean, std, colour, legend label) for each curve.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (train_scores, "r", "Training score"),
            (test_scores, "g", "Cross-validation score"),
        )
    ]

    # Shaded bands first, then the lines on top of them.
    for mean, std, colour, _ in curves:
        plt.fill_between(train_sizes, mean - std, mean + std,
                         alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="lower right")
    return plt

In [None]:
plot_learning_curve(x_train, y_train, train_sizes, train_scores, test_scores, ylim=(0.7, 1.01), figsize=(14,6))
plt.show()

### Logistic Regression

In [None]:
df = pd.read_csv('contentTranslated.csv')
df.head()

In [None]:
# Drop the empty helper columns. errors='ignore' keeps the cell re-runnable
# once the columns are already gone.
df = df.drop(labels=['Unnamed: 0', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'],
             axis=1, errors='ignore')
# The CSV's document and label columns are 'text' and 'sentiment' (see the
# data.head() output near the top of the notebook); it has no 'message' or
# 'class' column.
x = df['text']
# Missing labels become 0.
y = df['sentiment'].fillna(0)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
bag = count.fit_transform(x)

count.vocabulary_

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True,
                         norm='l2',
                         smooth_idf=True)

np.set_printoptions(precision=2)

# Feed the tf-idf transformer with our previously created Bag of Words
tfidf.fit_transform(bag).toarray()

In [None]:
from collections import Counter

# Count every space-separated token across all documents.
vocab = Counter(word for twit in x for word in twit.split(' '))

vocab.most_common(20)

In [None]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [None]:
import math

def plot_distribution(vocabulary):
    """Show a 500-bin density histogram of the natural log of each word's
    count in ``vocabulary`` (anything exposing ``most_common()``)."""
    log_counts = [math.log(count) for _word, count in vocabulary.most_common()]
    hist, edges = np.histogram(log_counts, density=True, bins=500)

    p = figure(tools="pan,wheel_zoom,reset,save",
               toolbar_location="above",
               title="Word distribution accross all twits")
    # One rectangle per bin, spanning its left and right edge.
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555", )
    show(p)

plot_distribution(vocab)



In [None]:
import nltk

from nltk.corpus import stopwords
stop = stopwords.words('english')

# `stop` stays a list because the grid search further down passes it on as a
# vectorizer `stop_words` value; membership is tested against a set so each
# lookup is constant-time instead of a scan of the whole list.
stop_set = set(stop)
vocab_reduced = Counter({w: c for w, c in vocab.items() if w not in stop_set})

vocab_reduced.most_common(20)

In [None]:
plot_distribution(vocab_reduced)

In [None]:
import re

def preprocessor(text):
    """Return a cleaned, lower-cased version of ``text``.

    HTML tags are removed, runs of non-word characters collapse to a single
    space, and any emoticons found in the input are appended at the end with
    their '-' nose removed (':-)' becomes ':)').

    The patterns are raw strings: as plain strings, '\\)', '\\(' and '\\W'
    are invalid escape sequences that Python warns about.
    """
    # Remove HTML markup
    text = re.sub(r'<[^>]*>', '', text)
    # Collect emoticons before punctuation is stripped below
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Lower-case, squash non-word runs to one space, then append the emoticons
    text = (re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))

    return text

print(preprocessor('This!! twit man :) is <b>nice</b>'))

In [None]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    pieces = text.split()
    return pieces

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each piece."""
    return list(map(porter.stem, text.split()))

#print(tokenizer('Hi there, I am loving this, like with a lot of love'))
#print(tokenizer_porter('Hi there, I am loving this, like with a lot of love'))

In [None]:
from sklearn.model_selection import train_test_split

# split the dataset in train and test
#X = train['SentimentText']
#y = train['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0, stratify=y)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Two grids over the same vectorizer and classifier options: the first uses
# tf-idf weighting, the second disables idf and normalisation (raw term counts).
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__preprocessor': [None, preprocessor],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__preprocessor': [None, preprocessor],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

# The grid searches both 'l1' and 'l2' penalties. The solver is pinned to
# liblinear, which supports both; scikit-learn's newer default solver
# (lbfgs) rejects 'l1', which would make half the candidates fail.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [None]:
gs_lr_tfidf.fit(X_train, y_train)

In [None]:
print('Best parameter set: ' + str(gs_lr_tfidf.best_params_))
print('Best accuracy: %.3f' % gs_lr_tfidf.best_score_)

In [None]:
clf = gs_lr_tfidf.best_estimator_
print('Accuracy in test: %.3f' % clf.score(X_test, y_test))

In [None]:
pred = clf.predict(X_test)

In [None]:
def report_results(model, X, y):
    """Score ``model`` on ``X`` against the true labels ``y``.

    Returns a dict with the keys 'auc', 'f1', 'acc', 'precision' and
    'recall'. AUC is computed from column 1 of ``predict_proba``; the other
    metrics use the hard predictions from ``predict``.
    """
    # Use the `X` argument. This line used to read the global `x` (the full
    # data set), so the probabilities did not correspond to `y`.
    pred_proba = model.predict_proba(X)[:, 1]
    pred = model.predict(X)

    auc = roc_auc_score(y, pred_proba)
    acc = accuracy_score(y, pred)
    f1 = f1_score(y, pred)
    prec = precision_score(y, pred)
    rec = recall_score(y, pred)
    result = {'auc': auc, 'f1': f1, 'acc': acc, 'precision': prec, 'recall': rec}
    return result


In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import classification_report,average_precision_score,precision_recall_curve,precision_score,recall_score,f1_score

In [None]:
# Macro-averaged metrics for the logistic-regression predictions. Keyword
# arguments that only restated the scikit-learn defaults are omitted.
pprf = precision_recall_fscore_support(y_test, pred, average='macro')
print("log reg precision_recall_fscore_support ", pprf)

pps1 = precision_score(y_test, pred, average='macro')
print("log reg precision_score -> %.2f"%pps1)

prs1 = recall_score(y_test, pred, average='macro')
print("log reg recall_score -> %.2f"%prs1)

# Computed once, printed twice: raw value, then rounded to two decimals.
pf11 = f1_score(y_test, pred, average='macro')
print("f1_score", pf11)
print('log reg f1 score -> %.2f'%pf11)