In [9]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string


from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

In [69]:
import matplotlib.pyplot as plt
%matplotlib inline

In [65]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import f1_score

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [3]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


#### Convert to Lower

In [4]:
train['ctweet'] = train.tweet.str.lower()
test['ctweet'] = test.tweet.str.lower()

#### Remove URLS

In [5]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [6]:
train['ctweet'] = train.ctweet.apply(remove_urls)
test['ctweet'] = test.ctweet.apply(remove_urls)

#### Remove Special Chars

In [7]:
def remove_speical_chars(text):
    text = re.sub(r"[^a-z ]"," ",text)
    return re.sub(' +', ' ',text)

In [8]:
train['ctweet'] = train.ctweet.apply(remove_speical_chars)
test['ctweet'] = test.ctweet.apply(remove_speical_chars)

#### Lemmatization

In [21]:
nlp = spacy.load('en_core_web_md')

In [26]:
def lemmatize(text):
    doc = nlp(text)
    return " ".join([d.lemma_ for d in doc])

In [10]:
def lemmatize(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [14]:
train['ctweet'] = train.ctweet.apply(lemmatize)
test['ctweet'] = test.ctweet.apply(lemmatize)

#### Remove words less than 3 len

In [31]:
def rem_len(text):
    return [word for word in text.split() if len(word) >3]

In [32]:
train['ctweet'] = train.ctweet.apply(rem_len)
test['ctweet'] = test.ctweet.apply(rem_len)

AttributeError: 'list' object has no attribute 'split'

In [33]:
train.ctweet = train.ctweet.apply(lambda x : " ".join(x))

In [34]:
train.head()

Unnamed: 0,id,label,tweet,ctweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test android apps beauti...
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally transparant silicon case thanks uncle ...
2,3,0,We love this! Would you go? #talk #makememorie...,love this would talk makememories unplug relax...
3,4,0,I'm wired I know I'm George I was made that wa...,wired know george made that iphone cute davent...
4,5,1,What amazing service! Apple won't even talk to...,what amazing service apple even talk about que...


In [35]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

In [36]:
cnt = CountVectorizer()

In [58]:
tfidf = TfidfVectorizer(min_df=2,max_df=.9,stop_words='english')

In [59]:
train_vec = tfidf.fit_transform(train.ctweet)

In [60]:
train_vec.toarray().shape

(7920, 4753)

In [63]:
mnb_clf = MultinomialNB()

In [64]:
X = train_vec
y = train.label

In [68]:
y.value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [66]:
cv_score = cross_val_score(mnb_clf,X,y,cv=4,scoring='f1')

In [67]:
cv_score

array([0.75664187, 0.73447537, 0.74514039, 0.74463519])