In [1]:
import pandas as pd

# train / test split 
from sklearn.model_selection import train_test_split

# Importing necessary libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

# evaluation 
from sklearn import metrics 
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report

# Task: Building an author-identifyer programm
- for each subreddit AUTOMATICALLY find redditor wiht highest score/most comments
- train a model on that author. That would the "author x or not"-model (there has to be a better multiclass way to do this)
- scan all comments in controversial in that same subreddit


### Preparing the dataset

In [2]:
# import the 2 dataframes of 2 redditors and combine them

spez = pd.read_csv('spez_comments.csv',index_col=0)
chop = pd.read_csv('chop_comments.csv',index_col=0)

In [3]:
spez['author'] = 0
chop['author'] = 1

In [4]:
spez.shape, chop.shape

((750, 4), (847, 4))

In [5]:
spez.shape[0]+chop.shape[0]

1597

In [6]:
two_authors = pd.merge(spez, chop, how='outer')
two_authors

Unnamed: 0,body,score,created,author
0,Think it only works on desktop:\n\nhttps://old...,363,2020-02-25 12:57:46,0
1,> Did you ever imagine 14 years ago that you w...,983,2020-02-25 09:29:49,0
2,We're working on a new version of mweb as we s...,282,2020-02-25 09:19:11,0
3,\> Have any communities EVER been unquarantine...,713,2020-02-25 07:55:18,0
4,"For the most part, I don't admin flag jokes.",271,2020-02-25 07:52:09,0
...,...,...,...,...
1592,Long haul trucker here... \n\nI've been doing ...,2,2018-08-12 05:32:10,1
1593,A little slice of paradise.,1,2018-08-11 12:53:38,1
1594,It's a dog inside a dog\n\nDogception,1,2017-03-21 21:06:42,1
1595,If you come from a religious background...\n\n...,-1,2017-03-20 00:44:54,1


In [7]:
# are the classes balanced? 

class_dist = two_authors.groupby('author')['author'].count()
class_dist

author
0    750
1    847
Name: author, dtype: int64

In [8]:
# distribution of the two authors 
spez_dist = class_dist[0]/two_authors.shape[0]
chop_dist = class_dist[1]/two_authors.shape[0]

print(f'spez: {spez_dist:.2f}%\nchop: {chop_dist:.2f}%')

spez: 0.47%
chop: 0.53%


In [9]:
# train test split 

X_train, X_test, y_train, y_test = train_test_split(two_authors['body'], 
                                                    two_authors['author'], 
                                                    test_size=0.20, 
                                                    random_state=42)

### Definfing models to predict author

In [10]:
# prediction the probability of a certain class

nb = MultinomialNB()
tfidf = TfidfVectorizer(min_df=2, stop_words='english')
Xtrain_trans = tfidf.fit_transform(X_train) 
X_test_trans = tfidf.transform(X_test)

In [11]:
# fitting the bayes' model on it 

fit_bayes = nb.fit(Xtrain_trans, y_train)

In [12]:
y_proba = fit_bayes.predict_proba(X_test_trans)

In [13]:
y_pred =  fit_bayes.predict(X_test_trans)
y_pred

array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,

In [14]:
# the probability to predict each class -> which class is which? 
y_proba

array([[0.46436962, 0.53563038],
       [0.3554    , 0.6446    ],
       [0.72212928, 0.27787072],
       [0.64482287, 0.35517713],
       [0.21418321, 0.78581679],
       [0.10960869, 0.89039131],
       [0.40305981, 0.59694019],
       [0.64346743, 0.35653257],
       [0.63770898, 0.36229102],
       [0.41997   , 0.58003   ],
       [0.583574  , 0.416426  ],
       [0.35357557, 0.64642443],
       [0.17335655, 0.82664345],
       [0.63186538, 0.36813462],
       [0.97493118, 0.02506882],
       [0.16076553, 0.83923447],
       [0.6929605 , 0.3070395 ],
       [0.31233391, 0.68766609],
       [0.5309102 , 0.4690898 ],
       [0.83299016, 0.16700984],
       [0.73479086, 0.26520914],
       [0.76791973, 0.23208027],
       [0.3756634 , 0.6243366 ],
       [0.70317648, 0.29682352],
       [0.63028351, 0.36971649],
       [0.5839129 , 0.4160871 ],
       [0.60405351, 0.39594649],
       [0.60405351, 0.39594649],
       [0.92072744, 0.07927256],
       [0.97406088, 0.02593912],
       [0.

In [15]:
# some pre-processing to find the number of words per comment 

def word_counter(dataframe):
    """function that count all words in a comment within a dataframe
    and a outputs that as a new column"""
    words_in_comment = []
    for comment in dataframe['body']:
        words = comment.split()
        words_in_comment.append(len(words))
    dataframe['word_num'] = words_in_comment
    return dataframe

In [16]:
# function that just fits a model and return that model

def model_trainer(X_train, y_train):
    """takes source and target, trains model and 
    returns trained Baye's model"""
    tfidf = TfidfVectorizer(min_df=2, stop_words='english')
    # keeping default hyperparameters for the moment (min_df=2, stop_words='english')
    nb = MultinomialNB()
    #creating a pipeline that transforms and fits in one go 
    model = make_pipeline(tfidf, nb)
    model.fit(X_train, y_train)
    return model 

In [17]:
# predictor function

def predictor(pipeline_model, test_data):
    """function takes an existing model, test_data
    and outputs predictions (0/1) as an array"""
    #test_data = [test_data]
    pred = pipeline_model.predict(test_data)
    return pred

In [18]:
def evaluator(y_pred):
    print("Accuracy", metrics.accuracy_score(y_test, y_pred, normalize=True))
    print("Precision", metrics.precision_score(y_test, y_pred, average='weighted'))
    print("Recall", metrics.recall_score(y_test, y_pred, average='weighted'))
    print("F1-Score", metrics.f1_score(y_test, y_pred, average='weighted'))

In [19]:
# applying the model on the dataset
bayes_model = model_trainer(X_train, y_train)

In [20]:
predicitons = predictor(bayes_model, X_test)

In [21]:
evaluator(predicitons)

Accuracy 0.8875
Precision 0.8888910188352798
Recall 0.8875
F1-Score 0.8873237781954888


In [22]:
metrics.confusion_matrix(y_test, predicitons)

array([[134,  23],
       [ 13, 150]])

In [23]:
# creating a dataframe to compare fase positives & negatives
comp_frame = pd.DataFrame(X_test)

In [24]:
# adding the predictions and true values
comp_frame['true'] = y_test
comp_frame['pred'] = predicitons
comp_frame

Unnamed: 0,body,true,pred
802,Boot camp,1,1
124,I can handle it.,0,1
350,It's going ok. I've never been a great manager...,0,0
682,The IP stuff has been an interesting challenge...,0,0
1324,I'm more worried about the ones we dont see un...,1,1
...,...,...,...
1068,I have hauled several loads of Yuengling out o...,1,1
1049,Oh ok I see. Yeah at the rate they are going i...,1,1
792,Giving a damn about what strangers think about...,1,1
962,What country are you from? \n\nCompany drivers...,1,1


In [25]:
# checking out false positives and false negatives 
false_chop = comp_frame[(comp_frame['true'] == 0) & (comp_frame['pred'] == 1)]
false_spez = comp_frame[(comp_frame['true'] == 1) & (comp_frame['pred'] == 0)]

### Assumption: amoung the falsely classified comments are overwhelmingly one word comments. A single word should/could be too little to classifiy an author. 

In [None]:
false_chop['body'].iloc[0].split()

In [None]:
new_falsespez = word_counter(false_spez).sort_values(by='word_num', ascending=False)

In [None]:
new_spez = word_counter(spez).sort_values(by='word_num', ascending=False)

In [None]:
new_spez

In [None]:
new_spez['word_num'].plot(kind='bar')

In [None]:
new_falsespez['word_num'].plot(kind='bar')

### Oberservation: 
all falsely attributed comments have 35 or less words. 
### Assumption: 
the bigger the word count per comment, the easier it will be to attribute it correctly. In this concrete case, it would reduce my falsely 'spez' attributed comments to limit allowed comment to a word count of > 35 words. 

I assume that this threshold of when a comment reliably can be classified as belonging to a certain author varies depending on several factors: comment corpus (more is better), generally more wordy, maybe even essay-like comment (more words per comment = better) and the same would hold true for another  