In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import seaborn as sns
import pickle
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix,classification_report

In [3]:
df1 = pd.read_csv('clickbait_data_1.txt',sep='\x01',header=None)
df2 = pd.read_csv('non_clickbait_data_1.txt',sep='\x01',header=None)

In [4]:
#For Clickbait class variable = 1
df1['Class']=1.0
#For NonClickbait class variable = 0
df2['Class']=0.0

porter = PorterStemmer()
sc= StandardScaler()
cleandf_entire= []

In [5]:
def generateLength (df):
    df['length']= df[0].apply(len)
    df=df.rename(index=str,columns={0:'Text'})
    df.fillna(0)
    return df

def concatinateDataframes (df1, df2):
    df3 = pd.concat([df1,df2],ignore_index= True)
    return df3

def cleanDataframe(df):
    for i in range(0,len(df)):
        cleandf = re.sub('[^a-zA-Z0-9]',' ',df['Text'][i])
        cleandf = cleandf.lower()
        cleandf = [porter.stem(word) for word in cleandf.split() if word not in set(stopwords.words('english'))]
        cleandf = ' '.join(cleandf)
        cleandf_entire.append(cleandf)
    return cleandf_entire

In [46]:
def dfVectorizeCountVec(cleandf_entire):
    cv = CountVectorizer()
    #cv = CountVectorizer(max_features=10000)
    matrix_df = cv.fit_transform(cleandf_entire).toarray()
    return matrix_df


def dfVectorizeTfIdfVec(cleandf_entire):
    vectorizer = TfidfVectorizer()
    matrix_df = vectorizer.fit_transform(cleandf_entire).toarray()
    return matrix_df


def getClassLableDataframe(df):
    return (df.iloc[:,1].values)

def getScaledDataframe(df):
    return (sc.fit_transform(df.astype(float)))
    

In [7]:
df1 = generateLength (df1)
df2 = generateLength (df2)

In [8]:
df1.head(2)

Unnamed: 0,Text,Class,length
0,Should I Get Bings,1.0,18
1,Which TV Female Friend Group Do You Belong In,1.0,45


In [9]:
df2.head(2)

Unnamed: 0,Text,Class,length
0,Bill Changing Credit Card Rules Is Sent to Oba...,0.0,74
1,"In Hollywood, the Easy-Money Generation Toughe...",0.0,51


In [10]:
concated_df = concatinateDataframes (df1, df2)

In [11]:
concated_df.head(2)

Unnamed: 0,Text,Class,length
0,Should I Get Bings,1.0,18
1,Which TV Female Friend Group Do You Belong In,1.0,45


In [12]:
concated_df.tail(2)

Unnamed: 0,Text,Class,length
5656,Saudis Delay Local Elections by 2 Years,0.0,39
5657,Police arrest train passenger for a 16-hour lo...,0.0,71


In [13]:
cleaned_df = cleanDataframe(concated_df)

In [14]:
cleaned_df

['get bing',
 'tv femal friend group belong',
 'new star war forc awaken trailer give chill',
 'vine new york celebr big brother fuck perfect',
 'coupl stun photo shoot babi learn inoper brain tumor',
 'flirt queer girl without make total fool',
 '32 cute thing distract awkward thanksgiv',
 'disney princess florida',
 'quot lyric best describ depress',
 'natali dormer sam claflin play game see actual last hunger game',
 '16 perfect respons indian patriarchi',
 '21 time die captain america civil war teaser',
 '17 time kourtney kardashian shut famili',
 'coffe make poop',
 'celebr ex base zodiac',
 '17 hairdress struggl everi black girl know true',
 'walter white heisenberg',
 'canadian groom ever left wed plow guest snow storm',
 'one realli weird thing butterfre',
 '15 resolut make good 2016',
 'new thing tri 2016',
 'zoo anim around world open christma present earli',
 'tell us ie erica ash',
 '9 time cri',
 '21 vegetarian dump dinner crock pot',
 'goat bulli tiger friend',
 '8 fall s

In [15]:
vectorized_df = dfVectorizeCountVec(cleaned_df)

In [16]:
vectorized_df

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
classLables_df = getClassLableDataframe(concated_df)

In [18]:
classLables_df

array([1., 1., 1., ..., 0., 0., 0.])

In [19]:
X_train,X_test,y_train,y_test = train_test_split(vectorized_df,classLables_df, test_size =0.25, stratify=classLables_df)

In [20]:
X_train = getScaledDataframe(X_train.astype(float))
X_test = getScaledDataframe(X_test.astype(float))

In [27]:
def generateNaiveBayesClassifier (X_train,y_train):
    from sklearn.naive_bayes import GaussianNB
    classifier1 = GaussianNB()
    classifier1.fit(X_train,y_train)
    #NBClassifier = pickle.dumps(classifier1)
    filename = 'finalized_model_NaiveBayes.sav'
    pickle.dump(classifier1, open(filename, 'wb'))
    
def predictNaiveBayesClassifier (X_test):
    #from keras.models import load_model
    #model = load_model('NBClassifier.h5')
    #import pickle
    filename = 'finalized_model_NaiveBayes.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    #model = pickle.loads(NBClassifier)
    y_pred = loaded_model.predict(X_test)
    return y_pred

def generateConfusionMatrix (y_test,y_pred):
    cm= confusion_matrix(y_test,y_pred)
    return cm

In [28]:
def generateDecisionTreeClassifier (X_train,y_train):
    from sklearn.tree import DecisionTreeClassifier
    classifier2 = DecisionTreeClassifier(criterion='entropy',random_state=0)
    classifier2.fit(X_train,y_train)
    filename = 'finalized_model_DecisionTree.sav'
    pickle.dump(classifier2, open(filename, 'wb'))
    
def predictDecisionTreeClassifier (X_test):
    filename = 'finalized_model_DecisionTree.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    y_pred = loaded_model.predict(X_test)
    return y_pred

In [29]:
def generateSVMClassifier (X_train,y_train):
    from sklearn.svm import SVC
    classifier3 = SVC(kernel='linear')
    classifier3.fit(X_train,y_train)
    filename = 'finalized_model_SVM.sav'
    pickle.dump(classifier3, open(filename, 'wb'))
    
def predictSVMClassifier (X_test):
    filename = 'finalized_model_SVM.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    y_pred = loaded_model.predict(X_test)
    return y_pred

In [30]:
def generateKNNClassifier (X_train,y_train):
    from sklearn.neighbors import KNeighborsClassifier
    classifier4 = KNeighborsClassifier(n_neighbors=3,weights='distance',metric='chebyshev')
    classifier4.fit(X_train,y_train)
    filename = 'finalized_model_KNN.sav'
    pickle.dump(classifier4, open(filename, 'wb'))
    
def predictKNNClassifier (X_test):
    filename = 'finalized_model_KNN.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    y_pred = loaded_model.predict(X_test)
    return y_pred

In [37]:
def generateRandomForestClassifier (X_train,y_train):
    from sklearn.ensemble import RandomForestClassifier    
    classifier5 = RandomForestClassifier(n_estimators=10,criterion="entropy",random_state=0)
    classifier5.fit(X_train,y_train)
    filename = 'finalized_model_RandomForest.sav'
    pickle.dump(classifier5, open(filename, 'wb'))
    
def predictRandomForestClassifier (X_test):
    filename = 'finalized_model_RandomForest.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    y_pred = loaded_model.predict(X_test)
    return y_pred

In [32]:
generateNaiveBayesClassifier (X_train,y_train)
y_pred_NB = predictNaiveBayesClassifier (X_test)
cm_NB = generateConfusionMatrix (y_test,y_pred_NB)
print(classification_report(y_test,y_pred_NB))
cm_NB

             precision    recall  f1-score   support

        0.0       0.81      0.94      0.87       790
        1.0       0.91      0.72      0.80       625

avg / total       0.85      0.84      0.84      1415



array([[744,  46],
       [177, 448]], dtype=int64)

In [33]:
generateDecisionTreeClassifier (X_train,y_train)
y_pred_DT = predictDecisionTreeClassifier (X_test)
cm_DT = generateConfusionMatrix (y_test,y_pred_DT)
print(classification_report(y_test,y_pred_DT))
cm_DT

             precision    recall  f1-score   support

        0.0       0.86      0.90      0.88       790
        1.0       0.86      0.81      0.84       625

avg / total       0.86      0.86      0.86      1415



array([[708,  82],
       [116, 509]], dtype=int64)

In [34]:
generateSVMClassifier (X_train,y_train)
y_pred_SVM = predictSVMClassifier (X_test)
cm_SVM = generateConfusionMatrix (y_test,y_pred_SVM)
print(classification_report(y_test,y_pred_SVM))
cm_SVM

             precision    recall  f1-score   support

        0.0       0.91      0.90      0.91       790
        1.0       0.88      0.89      0.88       625

avg / total       0.90      0.90      0.90      1415



array([[711,  79],
       [ 67, 558]], dtype=int64)

In [35]:
generateKNNClassifier (X_train,y_train)
y_pred_KNN = predictKNNClassifier (X_test)
cm_KNN = generateConfusionMatrix (y_test,y_pred_KNN)
print(classification_report(y_test,y_pred_KNN))
cm_KNN

             precision    recall  f1-score   support

        0.0       0.74      0.38      0.50       790
        1.0       0.51      0.83      0.63       625

avg / total       0.64      0.58      0.56      1415



array([[303, 487],
       [109, 516]], dtype=int64)

In [38]:
generateRandomForestClassifier (X_train,y_train)
y_pred_RF = predictRandomForestClassifier (X_test)
cm_RF = generateConfusionMatrix (y_test,y_pred_RF)
print(classification_report(y_test,y_pred_RF))
cm_RF

             precision    recall  f1-score   support

        0.0       0.85      0.95      0.90       790
        1.0       0.93      0.79      0.86       625

avg / total       0.89      0.88      0.88      1415



array([[751,  39],
       [129, 496]], dtype=int64)

In [39]:
def preprocessUserInput (s1):
    cleanedUI = cleanDataframe (s1) 
    vectorizedUI = dfVectorizeCountVec (cleanedUI)
    #vectorizedUI = vectorizedUI[-1]
    print ("vectorizedUI")
    print (vectorizedUI)
    scaledUI = getScaledDataframe(vectorizedUI)
    scaledUI = scaledUI[-1]
    print (scaledUI)
    return scaledUI

In [40]:
def predictUI(scaledUI):
    p_NB = predictNaiveBayesClassifier (scaledUI)
    print("Prediction using NaiveBayes : {0}".format('Clickbait' if p_NB[0] == 1 else 'Non Clickbait'))
    p_DT = predictDecisionTreeClassifier (scaledUI)
    print("Prediction using DecisionTree : {0}".format('Clickbait' if p_DT[0] == 1 else 'Non Clickbait'))
    p_SVM = predictSVMClassifier (scaledUI)
    print("Prediction using SVM : {0}".format('Clickbait' if p_SVM[0] == 1 else 'Non Clickbait'))
    p_KNN = predictKNNClassifier (scaledUI)
    print("Prediction using KNN : {0}".format('Clickbait' if p_KNN[0] == 1 else 'Non Clickbait'))
    p_RF = predictRandomForestClassifier (scaledUI)
    print("Prediction using RandomForest : {0}".format('Clickbait' if p_RF[0] == 1 else 'Non Clickbait'))

In [41]:
print("Enter sentence to be checked")

Enter sentence to be checked


In [42]:
user_input = input()

32 Cute Things To Distract From Your Awkward Thanksgiving


In [43]:
print (user_input)

32 Cute Things To Distract From Your Awkward Thanksgiving


In [47]:
UI = preprocessUserInput (pd.DataFrame([user_input],columns =['Text']))
UI

vectorizedUI
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[-0.0497959  -0.06923282 -0.01329322 ... -0.01329322 -0.01880111
 -0.01329322]


array([-0.0497959 , -0.06923282, -0.01329322, ..., -0.01329322,
       -0.01880111, -0.01329322])

In [48]:
predictUI([UI])

Prediction using NaiveBayes : Clickbait
Prediction using DecisionTree : Clickbait
Prediction using SVM : Clickbait
Prediction using KNN : Clickbait
Prediction using RandomForest : Clickbait
