In [1]:
import pandas as pd

from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report




In [2]:
data = pd.read_csv('messages.csv')
data.head()

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


### Preprocess Data

In [3]:
add_stopwords = set(stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'])
len(add_stopwords) 

188

In [4]:
def clean_text(x):
    global after_clean
    after_clean = []
    no_punc = "".join([char for char in x if char not in string.punctuation])
    no_stopwords = [word for word in no_punc.lower().split() if word not in add_stopwords]
    # print(no_stopwords)
    after_clean = " ".join(no_stopwords)


    return after_clean



def lemmatize(sentence):
    result = []
    clean_text(sentence)

    lemma_process = [WordNetLemmatizer().lemmatize(word) for word in after_clean]
    result = "".join(lemma_process)

    return result




In [5]:
data['clean_text'] =data['message'].apply(lambda a :lemmatize(a))
data.head()

Unnamed: 0,subject,message,label,clean_text
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0,content length 3386 appleiss research center u...
1,,"lang classification grimes , joseph e . and ba...",0,lang classification grimes joseph e barbara f ...
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0,posting inquiry sergei atamas satamas umabnet ...
3,risk,a colleague and i are researching the differin...,0,colleague researching differing degrees risk p...
4,request book information,earlier this morning i was on the phone with a...,0,earlier morning phone friend mine living south...


### TF-IDF

In [6]:
tf_vec = TfidfVectorizer(use_idf=True)
features = tf_vec.fit_transform(data['clean_text'])
features

<2893x64629 sparse matrix of type '<class 'numpy.float64'>'
	with 532093 stored elements in Compressed Sparse Row format>

In [7]:
# identify important words
words_df = pd.DataFrame(features[0].T.todense(), index=tf_vec.get_feature_names(), columns=["TF-IDF"])
words_df = words_df.sort_values('TF-IDF', ascending=False)
print (words_df.head(25))

                  TF-IDF
unix            0.227849
experience      0.211178
singapore       0.209923
apple           0.187985
productization  0.175052
knowledge       0.171319
expertise       0.169171
speech          0.161337
iss             0.155673
programmer      0.151817
required        0.146049
language        0.128035
models          0.123766
research        0.119354
center          0.117619
candidate       0.106558
modeling        0.105349
industry        0.103121
65              0.099561
statistical     0.098402
years           0.098108
located         0.097037
positions       0.094546
engineering     0.093181
strong          0.092962


### Modelling

In [8]:
y= data['label']
xtrain,xtest,ytrain,ytest = train_test_split(features,y, random_state=42)
print(xtrain.shape, features.shape)

(2169, 64629) (2893, 64629)


In [9]:
naive_model = MultinomialNB()
naive_model.fit(xtrain,ytrain)
naive_pred = naive_model.predict(xtest)


naive_result = classification_report(naive_pred, ytest)
print(naive_result)

              precision    recall  f1-score   support

           0       1.00      0.83      0.91       705
           1       0.14      1.00      0.24        19

    accuracy                           0.83       724
   macro avg       0.57      0.91      0.57       724
weighted avg       0.98      0.83      0.89       724



In [10]:
AdaBoost_model = AdaBoostClassifier()
AdaBoost_model.fit(xtrain,ytrain)
AdaBoost_pred = AdaBoost_model.predict(xtest)

AdaBoost_result = classification_report(AdaBoost_pred,ytest)
print(AdaBoost_result)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       588
           1       0.95      0.97      0.96       136

    accuracy                           0.98       724
   macro avg       0.97      0.98      0.98       724
weighted avg       0.98      0.98      0.98       724

