In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
combi = train_df.append(test_df, ignore_index=True, sort=False)

In [4]:
combi

Unnamed: 0,id,label,tweet
0,1,0.0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0.0,Finally a transparant silicon case ^^ Thanks t...
2,3,0.0,We love this! Would you go? #talk #makememorie...
3,4,0.0,I'm wired I know I'm George I was made that wa...
4,5,1.0,What amazing service! Apple won't even talk to...
...,...,...,...
9868,9869,,"#SamsungGalaxyNote7 Explodes, Burns 6-Year-Old..."
9869,9870,,Now Available - Hoodie. Check it out here - ht...
9870,9871,,There goes a crack right across the screen. If...
9871,9872,,@codeofinterest as i said #Adobe big time we m...


In [5]:
train_df

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [6]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt    

In [7]:
# remove twitter handles (@user)
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], "@[\w]*")

In [8]:
# remove special characters, numbers, punctuations
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")

In [9]:
combi['tidy_tweet'] = combi['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [10]:
combi.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,#fingerprint #Pregnancy Test https://goo.gl/h1...,#fingerprint #Pregnancy Test https MfQV #andro...
1,2,0.0,Finally a transparant silicon case ^^ Thanks t...,Finally transparant silicon case Thanks uncle ...
2,3,0.0,We love this! Would you go? #talk #makememorie...,love this Would #talk #makememories #unplug #r...
3,4,0.0,I'm wired I know I'm George I was made that wa...,wired know George made that #iphone #cute #dav...
4,5,1.0,What amazing service! Apple won't even talk to...,What amazing service Apple even talk about que...


In [11]:
tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split())
tokenized_tweet.head()

0    [#fingerprint, #Pregnancy, Test, https, MfQV, ...
1    [Finally, transparant, silicon, case, Thanks, ...
2    [love, this, Would, #talk, #makememories, #unp...
3    [wired, know, George, made, that, #iphone, #cu...
4    [What, amazing, service, Apple, even, talk, ab...
Name: tidy_tweet, dtype: object

In [12]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet.head()

0    [#fingerprint, #pregnanc, test, http, mfqv, #a...
1    [final, transpar, silicon, case, thank, uncl, ...
2    [love, thi, would, #talk, #makememori, #unplug...
3    [wire, know, georg, made, that, #iphon, #cute,...
4    [what, amaz, servic, appl, even, talk, about, ...
Name: tidy_tweet, dtype: object

In [13]:
tokenized_tweet

0       [#fingerprint, #pregnanc, test, http, mfqv, #a...
1       [final, transpar, silicon, case, thank, uncl, ...
2       [love, thi, would, #talk, #makememori, #unplug...
3       [wire, know, georg, made, that, #iphon, #cute,...
4       [what, amaz, servic, appl, even, talk, about, ...
                              ...                        
9868    [#samsunggalaxynot, explod, burn, year, thank,...
9869    [avail, hoodi, check, here, http, zetasuppli, ...
9870    [there, goe, crack, right, across, screen, cou...
9871             [said, #adob, time, well, includ, #appl]
9872    [final, thanx, father, #samsung, #galaxi, #gif...
Name: tidy_tweet, Length: 9873, dtype: object

In [14]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

tokenized_tweet

0       #fingerprint #pregnanc test http mfqv #android...
1       final transpar silicon case thank uncl #yay #s...
2       love thi would #talk #makememori #unplug #rela...
3       wire know georg made that #iphon #cute #davent...
4       what amaz servic appl even talk about question...
                              ...                        
9868    #samsunggalaxynot explod burn year thank rush ...
9869    avail hoodi check here http zetasuppli product...
9870    there goe crack right across screen could actu...
9871                    said #adob time well includ #appl
9872    final thanx father #samsung #galaxi #gift #fat...
Name: tidy_tweet, Length: 9873, dtype: object

In [15]:
train_df['tweet'] = tokenized_tweet

In [16]:
#vocab = cv.vocabulary_
y = train_df.iloc[:, 1].values

In [17]:
y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [18]:
train_df['tweet'].tail()

7915    live loud #lol #liveoutloud #selfi #smile #son...
7916    would like wish amaz make everi minut count #t...
7917    help love year neighbor with ipad thi morn jus...
7918    final #smart #pocket #wifi stay connect anytim...
7919    appl barcelona #appl #store #bcn #barcelona #t...
Name: tweet, dtype: object

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# bag-of-words feature matrix
bow = bow_vectorizer.fit_transform(combi['tidy_tweet'])

In [26]:
train_bow = bow[:7920,:]
test_bow = bow[7920:,:]

In [58]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_bow, 
                                                    y, 
                                                    test_size = 0.25, 
                                                    random_state = 101)

In [59]:
from sklearn.linear_model import LogisticRegression
lr_cv = LogisticRegression()
lr_cv.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
y_pred = lr_cv.predict(X_test)

In [66]:
y_pred_lr = lr_cv.predict_proba(X_test)
y_pred_lr_int = y_pred_lr[:,1] >= 0.30
y_pred_lr_int = y_pred_lr_int.astype(np.int)

In [67]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score,roc_curve
cm = confusion_matrix(y_test, y_pred_lr_int)
ac = accuracy_score(y_test, y_pred_lr_int)
roc = roc_auc_score(y_test, y_pred_lr_int)
print(classification_report(y_test, y_pred_lr_int))

              precision    recall  f1-score   support

           0       0.95      0.88      0.92      1450
           1       0.73      0.88      0.80       530

    accuracy                           0.88      1980
   macro avg       0.84      0.88      0.86      1980
weighted avg       0.89      0.88      0.88      1980



In [68]:
test_pred = lr_cv.predict_proba(test_bow)
test_pred_int = test_pred[:,1] >= 0.3
test_pred_int = test_pred_int.astype(np.int)
test_df['label'] = test_pred_int
submission = test_df[['id','label']]
submission.to_csv('sub_lr_bow.csv', index=False) # writing data to a CSV file

In [31]:
from sklearn.svm import SVC
svc_cv = SVC(kernel='linear', gamma=0.1)
svc_cv.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [32]:
y_pred_svc = svc_cv.predict(X_test)

In [33]:
cm = confusion_matrix(y_test, y_pred_svc)
ac = accuracy_score(y_test, y_pred_svc)
roc = roc_auc_score(y_test, y_pred_svc)
print(classification_report(y_test, y_pred_svc))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1450
           1       0.76      0.78      0.77       530

    accuracy                           0.87      1980
   macro avg       0.84      0.85      0.84      1980
weighted avg       0.88      0.87      0.87      1980



In [None]:
test_pred = model.predict(test_bow)
# test_pred_int = test_pred[:,1] >= 0.3
test_pred_int = test_pred.astype(np.int)
test_df['label'] = test_pred_int
submission = test_df[['id','label']]
submission.to_csv('sub_ann_bow.csv', index=False) # writing data to a CSV file

In [None]:
import xgboost as xgb
xgbc = xgb.XGBClassifier(learning_rate=0.05, n_estimators=500, max_depth=5, gamma=0.5)
xgbc.fit(X_train, y_train)

In [None]:
y_pred_xgb = xgbc.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred_xgb)
ac = accuracy_score(y_test, y_pred_xgb)
roc = roc_auc_score(y_test, y_pred_xgb)
print(classification_report(y_test, y_pred_xgb))

In [69]:
from sklearn.ensemble import RandomForestClassifier
rfc_cv = RandomForestClassifier(n_estimators=1000, warm_start=True)
rfc_cv.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=True)

In [70]:
y_pred_rfc = rfc_cv.predict(X_test)

In [71]:
cm = confusion_matrix(y_test, y_pred_rfc)
ac = accuracy_score(y_test, y_pred_rfc)
roc = roc_auc_score(y_test, y_pred_rfc)
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1450
           1       0.76      0.72      0.74       530

    accuracy                           0.86      1980
   macro avg       0.83      0.82      0.82      1980
weighted avg       0.86      0.86      0.86      1980



In [74]:
y_pred_rfc = rfc_cv.predict_proba(X_test)
y_pred_rfc_int = y_pred_rfc[:,1] >= 0.3
y_pred_rfc_int = y_pred_rfc_int.astype(np.int)

In [75]:
cm = confusion_matrix(y_test, y_pred_rfc_int)
ac = accuracy_score(y_test, y_pred_rfc_int)
roc = roc_auc_score(y_test, y_pred_rfc_int)
print(classification_report(y_test, y_pred_rfc_int))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90      1450
           1       0.70      0.88      0.78       530

    accuracy                           0.87      1980
   macro avg       0.83      0.87      0.84      1980
weighted avg       0.88      0.87      0.87      1980



In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_cv = DecisionTreeClassifier(max_depth=25)
dt_cv.fit(X_train, y_train)

In [None]:
y_pred_dt = dt_cv.predict_proba(X_test)
y_pred_dt_int = y_pred_dt[:,1] >= 0.3
y_pred_dt_int = y_pred_dt_int.astype(np.int)

In [None]:
cm = confusion_matrix(y_test, y_pred_dt_int)
ac = accuracy_score(y_test, y_pred_dt_int)
roc = roc_auc_score(y_test, y_pred_dt_int)
print(classification_report(y_test, y_pred_dt_int))

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(combi['tidy_tweet'])

In [76]:
train_tfidf = tfidf[:7920,:]
test_tfidf = tfidf[7920:,:]

xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(train_tfidf, y, random_state=101, test_size=0.25)

lr_cv.fit(xtrain_tfidf, y_train)

prediction = lr_cv.predict_proba(xvalid_tfidf)
prediction_int = prediction[:,1] >= 0.3
prediction_int = prediction_int.astype(np.int)



In [77]:
cm = confusion_matrix(yvalid, prediction_int)
ac = accuracy_score(yvalid, prediction_int)
roc = roc_auc_score(yvalid, prediction_int)
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

           0       0.95      0.87      0.91      1450
           1       0.71      0.88      0.79       530

    accuracy                           0.87      1980
   macro avg       0.83      0.88      0.85      1980
weighted avg       0.89      0.87      0.88      1980

