In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns

import re 
import nltk 

from nltk import sent_tokenize, word_tokenize
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from time import time

# from sklearn import svm
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
import io
import pandas as pd

# Location of the labelled training data.
# NOTE(review): this is a machine-specific absolute path; the notebook will only
# run elsewhere after pointing this constant at a local copy of train.csv.
TRAIN_CSV_PATH = r'C:\Users\Acer\Desktop\NLP project/train.csv'

# Read the whole training file into `df`, the frame every later cell starts from.
df = pd.read_csv(TRAIN_CSV_PATH)
print(df)

Preview of the CLASS label distribution in the dataset

In [None]:
from matplotlib import pyplot as plt

plt.style.use('fivethirtyeight')

# Fraction of rows with CLASS == 0 first, then the fraction of all other rows.
total_rows = len(df)
class_zero_rows = len(df[df.CLASS == 0])
shares = [class_zero_rows / total_rows,
          (total_rows - class_zero_rows) / total_rows]

plt.figure(figsize=(4, 8))
plt.pie(
    shares,
    labels=['Non-Advertise Content', 'Advertise Content'],
    explode=[0.2, 0],  # pull the first (CLASS == 0) wedge away from the centre
    autopct='%1.1f%%',
    shadow=True,
    wedgeprops={'edgecolor': 'black'},
)
plt.tight_layout()
plt.show()

In [None]:
# Store the character length of every CONTENT entry as a new column on df.
df['length_train'] = df['CONTENT'].str.len()

In [None]:
# Build the working frame used by the text-cleaning cells.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): df is concatenated with itself, so every row appears twice in
# `combi`; presumably a second frame (e.g. the test set) was intended - confirm.
combi = pd.concat([df, df], ignore_index=True)

In [None]:
def remove_pattern(input_txt,pattern):
  """Delete from input_txt every substring that `pattern` matches.

  inputs:
     - input_txt: the string to clean
     - pattern: regular expression (without capture groups) describing what to remove

  returns:
     - input_txt with all occurrences of each matched substring removed
  """
  for matched_text in re.findall(pattern, input_txt):
    # The match is removed as literal text. Feeding it back into re.sub would
    # re-interpret characters such as "(", "+" or "?" as regex syntax and
    # either remove the wrong text or raise re.error.
    input_txt = input_txt.replace(matched_text, '')
  return input_txt

In [None]:
# Strip @mentions: an "@" followed by any run of word characters.
combi['tidy_content'] = np.vectorize(remove_pattern)(combi['CONTENT'], r"@[\w]*")
# Replace every character that is not an ASCII letter or "#" with a space.
# The range is A-Z (the former "A-z" also let [ \ ] ^ _ ` through), and
# regex=True is explicit because pandas >= 2.0 treats the pattern as literal
# text by default.
combi['tidy_content'] = combi['tidy_content'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
# Keep only words longer than three characters (words of length <= 3 are dropped).
combi['tidy_content'] = combi['tidy_content'].apply(
    lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

In [None]:
# Spot-check ten randomly chosen rows of combi.
combi.sample(n=10)

In [None]:
# Split each cleaned text on whitespace, giving one list of tokens per row.
tokenized_tweet = combi['tidy_content'].map(str.split)

In [None]:
# Show the first few entries of tokenized_tweet.
tokenized_tweet.head()

In [None]:
# Download the 'wordnet' resource before building the WordNet lemmatizer.
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()

# Lemmatize token by token; every row stays a list of the same length.
tokenized_tweet = tokenized_tweet.map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens)))

In [None]:
# Join each token list back into one space-separated string.
# A vectorised apply replaces the element-by-element `tokenized_tweet[i] = ...`
# loop, which looked rows up by label and so relied on the index being 0..n-1.
tokenized_tweet = tokenized_tweet.apply(' '.join)
combi['tidy_tweet'] = tokenized_tweet

In [None]:
# Show the first rows of combi, including the columns added by the cleaning cells.
combi.head()

In [None]:
# Split the labelled data into training and test partitions (seeded for repeatability).
# NOTE(review): the split is taken from the raw df['CONTENT'] column, not from the
# cleaned text stored on `combi` - confirm that this is intended.
features = df['CONTENT']
labels = df['CLASS']
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42)

# Report the shape of the full frame and of each feature partition.
shape_report = [
    ('Number of rows in the total set: {}', df),
    ('Number of rows in the training set: {}', X_train),
    ('Number of rows in the test set: {}', X_test),
]
for template, subset in shape_report:
    print(template.format(subset.shape))

In [None]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer(stop_words = 'english', lowercase = True)

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)
print(X_train.shape)
print(training_data.shape)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

# Persist the fitted vocabulary so the same feature columns can be rebuilt later.
# The `with` block guarantees the file is flushed and closed after the dump.
import pickle
filename = 'vector_vocabulary.pkl'
with open(filename, 'wb') as vocabulary_file:
    pickle.dump(count_vector.vocabulary_, vocabulary_file)

In [None]:
def pipeline(learner_list, X_train, y_train, X_test, y_test): 
    '''
    Fit every learner, pickle it to disk and score it on the train and test sets.

    inputs:
       - learner_list: iterable of unfitted estimators exposing fit() and predict()
       - X_train: features training set
       - y_train: labels of the training set
       - X_test: features testing set
       - y_test: labels of the testing set

    returns:
       - a list with one dict per learner holding 'Algorithm', 'Training Time',
         'Prediction Time' and the Accuracy / F1 Score / Precision / Recall
         values under '<metric>: Test' and '<metric>: Train' keys

    side effects:
       - writes each fitted learner to "<ClassName>.pkl" in the working directory
         (two learners of the same class overwrite each other's file)
       - prints progress messages
    '''
    import pickle

    # (label, scoring function) pairs; each is called with its default arguments,
    # first on the test predictions and then on the training predictions.
    metrics = (('Accuracy', accuracy_score),
               ('F1 Score', f1_score),
               ('Precision', precision_score),
               ('Recall', recall_score))

    final_results = []

    for learner in learner_list:

        print(learner)
        name = learner.__class__.__name__

        # A fresh dict per learner, so no value can carry over from the previous one.
        results = {'Algorithm': name}

        # Fit the learner and time it.
        start = time() # Get start time
        print("Training {}".format(name))
        learner = learner.fit(X_train, y_train)
        end = time() # Get end time

        # Persist the fitted learner; the `with` block closes the file handle.
        filename = name + ".pkl"
        with open(filename, 'wb') as model_file:
            pickle.dump(learner, model_file)

        # Store the training time
        results['Training Time'] = end - start

        # The prediction time covers both the test-set and the training-set predictions.
        start = time() # Get start time
        predictions_test = learner.predict(X_test)
        predictions_train = learner.predict(X_train)
        end = time() # Get end time

        # Store the prediction time
        results['Prediction Time'] = end - start

        # Score the predictions on both partitions.
        for label, scorer in metrics:
            results['{}: Test'.format(label)] = scorer(y_test, predictions_test)
            results['{}: Train'.format(label)] = scorer(y_train, predictions_train)

        # Success
        print("Training {} finished in {:.2f} sec".format(name, results['Training Time']))
        print('----------------------------------------------------')

        final_results.append(results)

    # Return the list of per-learner result dicts
    return final_results

In [None]:
# make a list of models
# random_state is pinned (42, the same seed the train/test split uses) on the
# estimators that draw random numbers, so repeated runs give the same scores.
models = [LinearSVC(random_state=42),
          RandomForestClassifier(random_state=42),
          LogisticRegression(),
          KNeighborsClassifier()]

In [None]:
# Train and score every model. The raw output is kept in `raw_results`, not `re`:
# binding `re` here would shadow the regex module and break remove_pattern
# if the cleaning cells were run again.
raw_results = pipeline(models, training_data, y_train, testing_data, y_test)
results = pd.DataFrame(raw_results)
# Put the test metrics first, followed by the training metrics.
results = results.reindex(columns = ['Algorithm', 'Accuracy: Test', 'Precision: Test', 'Recall: Test', 'F1 Score: Test', 'Prediction Time',
                          'Accuracy: Train', 'Precision: Train', 'Recall: Train', 'F1 Score: Train', 'Training Time'])

In [None]:
# Column order for the summary table: test metrics first, training metrics after.
ordered_columns = [
    'Algorithm',
    'Accuracy: Test', 'Precision: Test', 'Recall: Test', 'F1 Score: Test', 'Prediction Time',
    'Accuracy: Train', 'Precision: Train', 'Recall: Train', 'F1 Score: Train', 'Training Time',
]

# Reorder the columns, then rank the models from best to worst test F1 score.
results = (
    results
    .reindex(columns=ordered_columns)
    .sort_values(by='F1 Score: Test', ascending=False)
)

In [None]:
# Display the results with a fresh 0..n-1 index. The return value is not
# assigned, so `results` itself keeps its current index.
results.reset_index(drop = True)

In [None]:
# From the describe() summary of results, show only the 'min' and 'max' rows.
results.describe().loc[['min', 'max'], :]

In [None]:
def _rows_at_max(frame, column):
    """Return the rows of `frame` whose `column` value equals that column's maximum."""
    is_max = frame[column] == frame[column].max()
    return frame[is_max]


# Row(s) holding the top test-set value of each metric (ties keep every tied row).
best_acc = _rows_at_max(results, 'Accuracy: Test')
best_f1 = _rows_at_max(results, 'F1 Score: Test')
best_precision = _rows_at_max(results, 'Precision: Test')
best_recall = _rows_at_max(results, 'Recall: Test')

In [None]:
sns.set_style('darkgrid')
plt.figure(figsize = (10, 5))

barWidth = 0.2

# One bar per metric inside every algorithm group:
# (column in `results`, legend label, index into the seaborn palette).
bar_specs = [('Accuracy: Test', 'Test Accuracy', 0),
             ('F1 Score: Test', 'F1 Score', 1),
             ('Precision: Test', 'Precision', 2),
             ('Recall: Test', 'Recall', 4)]

# Left-most bar of each group sits at 0, 1, 2, ...; the others are shifted by barWidth.
group_positions = np.arange(len(results))

# Make the plot
pal = sns.color_palette()
for slot, (column, legend_label, palette_index) in enumerate(bar_specs):
    plt.bar(group_positions + slot * barWidth, results[column], color=pal[palette_index],
            width=barWidth, edgecolor='white', label=legend_label)

# Add xticks on the middle of the group bars
plt.xlabel('Algorithm', fontweight='bold', fontsize = 15)
plt.ylabel('Score', fontweight = 'bold', fontsize = 15)
group_centres = group_positions + barWidth * (len(bar_specs) - 1) / 2
plt.xticks(group_centres, results['Algorithm'], rotation = 15, fontsize = 12)

# Create legend & Show graphic
plt.legend(fontsize = 12)

# One summary line per metric: best test value and the algorithm that reached it.
best_specs = [('Best Accuracy', best_acc, 'Accuracy: Test'),
              ('Best F1 Score', best_f1, 'F1 Score: Test'),
              ('Best Precision', best_precision, 'Precision: Test'),
              ('Best Recall', best_recall, 'Recall: Test')]
textstr = '\n'.join('{}: {:.3f} - {}'.format(title, frame[column].values[0], frame['Algorithm'].values[0])
                    for title, frame, column in best_specs)
props = dict(boxstyle='round', facecolor='lightgrey', alpha=0.5)

# Place the text box just right of the axes. Axes-fraction coordinates keep it
# next to the plot for any number of models; the former data-coordinate x=9.2
# only suited a chart with many more groups.
plt.text(1.02, 1, textstr, transform=plt.gca().transAxes, fontsize=12,
        verticalalignment='top', bbox=props)

plt.title('Classification Summary of Algorithms', fontweight = 'bold', fontsize = 17);
print("click to zoom")

In [None]:
# I am testing my classifier and pre-processing stages

data = ["hello"]

# Rebuild the vectorizer from the pickled vocabulary; the `with` block closes
# the file once it has been read.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# pickles produced by this notebook.
with open("vector_vocabulary.pkl", "rb") as vocabulary_file:
    saved_vocabulary = pickle.load(vocabulary_file)
count_vector = CountVectorizer(stop_words = 'english', lowercase = True, vocabulary=saved_vocabulary)

# Turn the sample text into a count matrix with the saved feature columns.
data=count_vector.transform(data)
print(data)

In [None]:
import pickle

# Load the pickled LinearSVC; the `with` block closes the file once it has been read.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# pickles produced by this notebook.
with open("LinearSVC.pkl", 'rb') as model_file:
    trained_model = pickle.load(model_file)

# Shape of the fitted coefficient array.
print(trained_model.coef_.shape)

# Predict the class of the vectorized sample held in `data`.
print(trained_model.predict(data))

In [None]:
# Load test.csv from the current working directory; later cells read its ID and CONTENT columns.
test = pd.read_csv('test.csv')

In [None]:
# Display the loaded test frame.
test


In [None]:
# Keep only the identifier and the text. .copy() makes test1 an independent
# frame, so adding a column to it later neither raises SettingWithCopyWarning
# nor depends on `test`.
test1 = test[['ID', 'CONTENT']].copy()

In [None]:
# Display the ID / CONTENT subset.
test1

In [None]:
# Load the saved vocabulary and the trained LinearSVC once, instead of
# re-reading both pickle files for every row; the `with` blocks close the files.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# pickles produced by this notebook.
with open("vector_vocabulary.pkl", "rb") as vocabulary_file:
    count_vector = CountVectorizer(stop_words = 'english', lowercase = True,
                                   vocabulary=pickle.load(vocabulary_file))
with open("LinearSVC.pkl", 'rb') as model_file:
    trained_model = pickle.load(model_file)

# Vectorize every CONTENT entry in one pass and predict all rows together.
data = count_vector.transform(test1['CONTENT'])

# op holds one predicted class per row of test1, in row order, as plain Python values.
# An empty frame yields an empty list without calling predict on zero samples.
op = trained_model.predict(data).tolist() if data.shape[0] else []


 
 


In [None]:
# Attach the predictions collected in `op` as the CLASS column.
# NOTE(review): test1 was selected out of `test`; unless it is an explicit copy,
# pandas may emit SettingWithCopyWarning on this assignment.
test1['CLASS'] = op

In [None]:
# Display test1 with the new CLASS column.
test1

In [None]:
# Drop the CONTENT column from test1.
test1 = test1.drop(columns=['CONTENT'])

In [None]:
# Use the ID column as the row index.
test1 = test1.set_index('ID')

In [None]:
# Print the final test1 frame as plain text.
print(test1)