In [117]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
nltk.download('brown')
from nltk.corpus import brown
import string

[nltk_data] Downloading package brown to /home/vinicius/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [118]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [119]:
train = pd.read_json('./data/train.json')
test = pd.read_json('./data/test.json')

# 1.0 - Cleaning

In [120]:
print("DF lenght:",len(train))
train.isnull().sum()

DF lenght: 4040


giver_username_if_known                                    0
number_of_downvotes_of_request_at_retrieval                0
number_of_upvotes_of_request_at_retrieval                  0
post_was_edited                                            0
request_id                                                 0
request_number_of_comments_at_retrieval                    0
request_text                                               0
request_text_edit_aware                                    0
request_title                                              0
requester_account_age_in_days_at_request                   0
requester_account_age_in_days_at_retrieval                 0
requester_days_since_first_post_on_raop_at_request         0
requester_days_since_first_post_on_raop_at_retrieval       0
requester_number_of_comments_at_request                    0
requester_number_of_comments_at_retrieval                  0
requester_number_of_comments_in_raop_at_request            0
requester_number_of_comm

**Many NULL values on "requester_user_flair"**

Definition: Users on RAOP receive badges (Reddit calls them flairs) which is a small picture next to their username. In our data set the user flair is either None (neither given nor received pizza, N=4282), "shroom" (received pizza, but not given, N=1306), or "PIF" (pizza given after having received, N=83).

In [121]:
train.drop(['requester_user_flair'],axis=1,inplace=True)

In [122]:
len(train[train['giver_username_if_known']=='N/A'])

3753

**To many N/A values on "giver_username_if_known" will drop this column**

In [123]:
train.drop('giver_username_if_known',axis=1,inplace=True)

In [124]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4040 entries, 0 to 4039
Data columns (total 30 columns):
number_of_downvotes_of_request_at_retrieval             4040 non-null int64
number_of_upvotes_of_request_at_retrieval               4040 non-null int64
post_was_edited                                         4040 non-null int64
request_id                                              4040 non-null object
request_number_of_comments_at_retrieval                 4040 non-null int64
request_text                                            4040 non-null object
request_text_edit_aware                                 4040 non-null object
request_title                                           4040 non-null object
requester_account_age_in_days_at_request                4040 non-null float64
requester_account_age_in_days_at_retrieval              4040 non-null float64
requester_days_since_first_post_on_raop_at_request      4040 non-null float64
requester_days_since_first_post_on_raop_at_retr

In [125]:
## Drop Request ID (object columns with probably no value)
train.drop('request_id',axis=1,inplace=True)

In [126]:
## Drop request raw text, will use request_text_edit_aware to predict
train.drop(['request_text'],axis=1,inplace=True)

In [127]:
## Drop requester_subreddits_at_request and username
train.drop(['requester_subreddits_at_request','requester_username'],axis=1,inplace=True)

**Now we have two object data, they are:  
1.request_text_edit_aware  
2.request_title**  



In [128]:
## Rename text and title columns
train.rename(columns={
    'request_text_edit_aware':'text',
    'request_title':'title'  
},inplace=True)

## Rename text and title columns
test.rename(columns={
    'request_text_edit_aware':'text',
    'request_title':'title'  
},inplace=True)


# 2.0 - Preparation

In [129]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [130]:
## Function to clean text (stopwords, punct and stemming)

def clean(text):
    sw = stopwords.words('english')
    stemmer = SnowballStemmer("english")
    
    no_punc = text.translate(str.maketrans('','',string.punctuation))
    
    no_punc = [word.lower() for word in no_punc.split() if word.lower() not in sw]
    
    no_stop = " ".join(no_punc)
    
    stem = [stemmer.stem(word) for word in no_stop.split()]
    
    return " ".join(stem)

In [131]:
## Apply cleaning function on text
train['text'] = train['text'].apply(clean)

In [132]:
## Apply function on title
train['title'] = train['title'].apply(clean)

# 3.0 - Modeling

In [175]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [210]:
xt = train.drop('requester_received_pizza',axis=1)
yt = train['requester_received_pizza']

In [208]:
vectorizer = CountVectorizer(analyzer='word', binary=True)

In [178]:
vectorizer.fit(xt['text'])

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [206]:
X = vectorizer.transform(xt['text']).todense()
y = yt.values
X.shape, y.shape

((4040, 9793), (4040,))

In [180]:
train['requester_received_pizza'].value_counts()

False    3046
True      994
Name: requester_received_pizza, dtype: int64

In [181]:
Xtrain, Xval, ytrain, yval = train_test_split(X,y)

In [182]:
model = MultinomialNB()

In [193]:
model.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [194]:
y_pred = model.predict(Xval)

In [195]:
print(classification_report(yval,y_pred))

              precision    recall  f1-score   support

       False       0.84      1.00      0.91       763
        True       0.98      0.43      0.59       247

    accuracy                           0.86      1010
   macro avg       0.91      0.71      0.75      1010
weighted avg       0.88      0.86      0.84      1010



# 4.0 - Submission

In [196]:
Xtest = vectorizer.transform(test['text']).todense()

In [197]:
Xtest.shape

(1631, 9793)

In [198]:
y_pred = model.predict(Xtest)

In [199]:
df_sub = test[['request_id']]

In [200]:
df_sub['requester_received_pizza'] = y_pred.astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [202]:
df_sub['requester_received_pizza'].value_counts()

0    1417
1     214
Name: requester_received_pizza, dtype: int64

In [203]:
df_sub.to_csv("first-submission-pizza.csv", index=False)

# 5.0 - New Approach

In [214]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [204]:
count_vectorizer = feature_extraction.text.CountVectorizer()

In [211]:
X_train, X_val, y_train, y_val = train_test_split(xt,yt)

In [213]:
train_vectors = count_vectorizer.fit_transform(X_train["text"])
val_vectors = count_vectorizer.transform(X_val["text"])

In [215]:
clf = linear_model.RidgeClassifier()


In [221]:
scores = model_selection.cross_val_score(clf, train_vectors, y_train, cv=3, scoring="f1")
scores

array([0.2744186 , 0.27826087, 0.30804598])

In [222]:
clf.fit(train_vectors, y_train)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [235]:
y_pred = clf.predict(val_vectors)

In [236]:
print(classification_report(y_val,y_pred))

              precision    recall  f1-score   support

       False       0.76      0.81      0.78       758
        True       0.28      0.22      0.25       252

    accuracy                           0.66      1010
   macro avg       0.52      0.52      0.51      1010
weighted avg       0.64      0.66      0.65      1010



In [225]:
test_vectors = count_vectorizer.transform(test['text'])

In [226]:
pred = clf.predict(test_vectors)

In [227]:
df_sub2 = test[['request_id']]

In [230]:
df_sub2['requester_received_pizza'] = pred.astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [232]:
df_sub2.requester_received_pizza.value_counts()

0    1456
1     175
Name: requester_received_pizza, dtype: int64

In [233]:
df_sub2.to_csv("alternative-submission-pizza.csv", index=False)