In [67]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
nltk.download('brown')
from nltk.corpus import brown
import string

[nltk_data] Downloading package brown to /home/vinicius/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [68]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [69]:
train = pd.read_json('./data/train.json')
test = pd.read_json('./data/test.json')

# 1.0 - Cleaning

In [70]:
print("DF lenght:",len(train))
train.isnull().sum()

DF lenght: 4040


giver_username_if_known                                    0
number_of_downvotes_of_request_at_retrieval                0
number_of_upvotes_of_request_at_retrieval                  0
post_was_edited                                            0
request_id                                                 0
request_number_of_comments_at_retrieval                    0
request_text                                               0
request_text_edit_aware                                    0
request_title                                              0
requester_account_age_in_days_at_request                   0
requester_account_age_in_days_at_retrieval                 0
requester_days_since_first_post_on_raop_at_request         0
requester_days_since_first_post_on_raop_at_retrieval       0
requester_number_of_comments_at_request                    0
requester_number_of_comments_at_retrieval                  0
requester_number_of_comments_in_raop_at_request            0
requester_number_of_comm

**Many NULL values on "requester_user_flair"**

Definition: Users on RAOP receive badges (Reddit calls them flairs) which is a small picture next to their username. In our data set the user flair is either None (neither given nor received pizza, N=4282), "shroom" (received pizza, but not given, N=1306), or "PIF" (pizza given after having received, N=83).

In [71]:
train.drop(['requester_user_flair'],axis=1,inplace=True)

In [72]:
len(train[train['giver_username_if_known']=='N/A'])

3753

**To many N/A values on "giver_username_if_known" will drop this column**

In [73]:
train.drop('giver_username_if_known',axis=1,inplace=True)

In [74]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4040 entries, 0 to 4039
Data columns (total 30 columns):
number_of_downvotes_of_request_at_retrieval             4040 non-null int64
number_of_upvotes_of_request_at_retrieval               4040 non-null int64
post_was_edited                                         4040 non-null int64
request_id                                              4040 non-null object
request_number_of_comments_at_retrieval                 4040 non-null int64
request_text                                            4040 non-null object
request_text_edit_aware                                 4040 non-null object
request_title                                           4040 non-null object
requester_account_age_in_days_at_request                4040 non-null float64
requester_account_age_in_days_at_retrieval              4040 non-null float64
requester_days_since_first_post_on_raop_at_request      4040 non-null float64
requester_days_since_first_post_on_raop_at_retr

In [75]:
## Drop Request ID (object columns with probably no value)
train.drop('request_id',axis=1,inplace=True)

In [76]:
## Drop request raw text, will use request_text_edit_aware to predict
train.drop(['request_text'],axis=1,inplace=True)

In [77]:
## Drop requester_subreddits_at_request and username
train.drop(['requester_subreddits_at_request','requester_username'],axis=1,inplace=True)

**Now we have two object data, they are:  
1.request_text_edit_aware  
2.request_title**  



In [78]:
## Rename text and title columns
train.rename(columns={
    'request_text_edit_aware':'text',
    'request_title':'title'  
},inplace=True)

## Rename text and title columns
test.rename(columns={
    'request_text_edit_aware':'text',
    'request_title':'title'  
},inplace=True)


# 2.0 - Preparation

In [79]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [80]:
## Function to clean text (stopwords, punct and stemming)

def clean(text):
    sw = stopwords.words('english')
    stemmer = SnowballStemmer("english")
    
    no_punc = text.translate(str.maketrans('','',string.punctuation))
    
    no_punc = [word.lower() for word in no_punc.split() if word.lower() not in sw]
    
    no_stop = " ".join(no_punc)
    
    stem = [stemmer.stem(word) for word in no_stop.split()]
    
    return " ".join(stem)

In [81]:
## Apply cleaning function on text
train['text'] = train['text'].apply(clean)

In [82]:
## Apply function on title
train['title'] = train['title'].apply(clean)

# 3.0 - Modeling

In [83]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

In [84]:
xt = train.drop('requester_received_pizza',axis=1)
yt = train['requester_received_pizza']

In [85]:
vectorizer = CountVectorizer(analyzer='word', binary=True)

In [86]:
vectorizer.fit(xt['text'])

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [87]:
X = vectorizer.transform(xt['text']).todense()
y = yt.values
X.shape, y.shape

((4040, 9793), (4040,))

In [88]:
Xtrain, Xval, ytrain, yval = train_test_split(X,y)

In [89]:
model = LGBMClassifier(n_estimators=500,learning_rate=0.003, num_leaves=2**6, subsample=0.75, subsample_freq=1, colsample_bytree=1,
                   random_state=0)

In [90]:
model.fit(Xtrain, ytrain)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1,
               importance_type='split', learning_rate=0.003, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=64, objective=None,
               random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.75, subsample_for_bin=200000, subsample_freq=1)

In [91]:
y_pred = model.predict(Xval)

In [92]:
print(classification_report(yval,y_pred))

              precision    recall  f1-score   support

       False       0.76      1.00      0.86       761
        True       1.00      0.02      0.03       249

    accuracy                           0.76      1010
   macro avg       0.88      0.51      0.45      1010
weighted avg       0.82      0.76      0.66      1010



# 4.0 - Submission

In [93]:
Xtest = vectorizer.transform(test['text']).todense()

In [94]:
Xtest.shape

(1631, 9793)

In [95]:
y_pred = model.predict(Xtest)

In [102]:
df_sub = test[['request_id']]

In [107]:
df_sub['requester_received_pizza'] = y_pred.astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [116]:
df_sub.to_csv("first-submission-pizza.csv", index=False)