# Simple Logit Model Trained on Wiki Comments

In [44]:
import pandas as pd
import numpy as np
import logging
import warnings
import gspread
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('sample_clustered.csv', parse_dates=['timestamp'])
sample = df[((df.toxicity<0.33))] # only nontoxic comments
sample['last'].value_counts()

STAY       123508
ARCHIVE     65705
REMOVE       6571
Name: last, dtype: int64

In [8]:
sample = sample[sample['last']!='STAY'] # STAY indicates lack of user activities, did not mean approval
sample['removed'] = (sample['last'] == 'REMOVE').astype(int)

In [7]:
def extract_features(df,field,training_data,testing_data,type="binary"):
    """Extract features using different methods"""
    
    logging.info("Extracting features and creating vocabulary...")
    
    if "binary" in type:
        
        # BINARY FEATURE REPRESENTATION
        cv= CountVectorizer(binary=True, max_df=0.95)
        cv.fit_transform(training_data[field].values)
        
        train_feature_set=cv.transform(training_data[field].values)
        test_feature_set=cv.transform(testing_data[field].values)
        
        return train_feature_set,test_feature_set,cv
  
    elif "counts" in type:
        
        # COUNT BASED FEATURE REPRESENTATION
        cv= CountVectorizer(binary=False, max_df=0.95)
        cv.fit_transform(training_data[field].values)
        
        train_feature_set=cv.transform(training_data[field].values)
        test_feature_set=cv.transform(testing_data[field].values)
        
        return train_feature_set,test_feature_set,cv
    
    else:    
        
        # TF-IDF BASED FEATURE REPRESENTATION
        tfidf_vectorizer=TfidfVectorizer(use_idf=True, max_df=0.95)
        tfidf_vectorizer.fit_transform(training_data[field].values)
        
        train_feature_set=tfidf_vectorizer.transform(training_data[field].values)
        test_feature_set=tfidf_vectorizer.transform(testing_data[field].values)
        
        return train_feature_set,test_feature_set,tfidf_vectorizer


In [45]:
def run_model(field, feature_rep):
    training_data, testing_data = train_test_split(sample, random_state = 2000,)

    # GET LABELS
    Y_train=training_data['removed'].values
    Y_test=testing_data['removed'].values

    # GET FEATURES
    X_train,X_test,feature_transformer=extract_features(sample,field,training_data,testing_data,type=feature_rep)

    # INIT LOGISTIC REGRESSION CLASSIFIER
    scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
    model=scikit_log_reg.fit(X_train,Y_train)
    
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)
    return (model, feature_transformer, 
            accuracy_score(Y_train, Y_train_pred), accuracy_score(Y_test, Y_test_pred),
           recall_score(Y_train, Y_train_pred), recall_score(Y_test, Y_test_pred))


In [47]:
data = []
models = {}
transformers = {}

for field in ['cleaned_content', 'content']:
    for feature_rep in ['binary', 'count', 'tfidf']:
        model, transformer, accuracy_train, accuracy_test, recall_train, recall_test = run_model(field, feature_rep)
        models[(field, feature_rep)] = model
        transformers[(field, feature_rep)] = transformer
        data.append({'field': field, 'feature_rep': feature_rep, 
                     'P/R train': f'{round(accuracy_train, 3)}/{round(recall_train, 3)}',
                     'P/R test':  f'{round(accuracy_test, 3)}/{round(recall_test, 3)}'})
        
accuracy_df = pd.DataFrame(data)
accuracy_df

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

Unnamed: 0,field,feature_rep,P/R train,P/R test
0,cleaned_content,binary,0.982/0.799,0.893/0.151
1,cleaned_content,count,0.94/0.34,0.91/0.104
2,cleaned_content,tfidf,0.94/0.34,0.91/0.104
3,content,binary,0.99/0.901,0.896/0.194
4,content,count,0.946/0.407,0.911/0.121
5,content,tfidf,0.946/0.407,0.911/0.121


## Apply to SN

In [19]:
gc = gspread.service_account()

In [20]:
sh = gc.open("Smart News-GT-set 2 ( 100/1000 )- V3-FTE (12/10)")
for i, worksheet in enumerate(sh.worksheets()):
    print(i, worksheet.title)

0 Consensus
1 Sailaja
2 Tom
3 Owen
4 Xiaolin
5 Katie
6 parent comment


In [21]:
worksheet = sh.get_worksheet(0)
df = pd.DataFrame(worksheet.get_values()[4: 104])
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39,40,41,42,43,44,45,46,47,48
0,4323245421758513930,,Meth’d up MAGA trash,2021-08-30 21:13:51,0,https://www.independent.co.uk/news/world/ameri...,Yes,Yes,Yes,Yes,...,4 - Grey area but leaning toward 5,5 - Remove,5 - Remove,REMOVE,REMOVE,,,,5 - Comment is unsubstantive,5 - Remove
1,4323245507347486702,,Lol I just made a comment it was removed for g...,2021-08-30 21:14:11,0,https://www.businessinsider.com/last-us-milita...,No,No,,No,...,1 - Don’t remove,5 - Remove,5 - Remove,REMOVE,REMOVE,,,,4 - Grey area but leaning toward 5,4 - Grey area but leaning toward 5
2,4323245612343493450,,Apparently you can’t say negative stuff about ...,2021-08-30 21:14:36,0,https://www.businessinsider.com/last-us-milita...,No,No,,No,...,1 - Don’t remove,4 - Grey area but leaning toward 5,1 - Don’t remove,KEEP,KEEP,,,,3 - Cannot decide (escalate for policy decision),3 - Cannot decide (escalate for policy decision)


In [22]:
df.tail(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39,40,41,42,43,44,45,46,47,48
97,4323266321522364955,,Finally,2021-08-30 22:36:53,0,https://www.businessinsider.com/last-us-milita...,No,,No,No,...,1 - Don’t remove,5 - Remove,5 - Remove,REMOVE,REMOVE,,,,,
98,4323270149567162085,,Good riddance.,2021-08-30 22:52:06,0,https://www.mirror.co.uk/news/us-news/anti-mas...,Yes,,No,No,...,1 - Don’t remove,5 - Remove,5 - Remove,REMOVE,REMOVE,,,,,
99,4323271602872522914,,Weird science!,2021-08-30 22:57:53,0,https://www.cincinnati.com/story/news/2021/08/...,No,,No,No,...,1 - Don’t remove,5 - Remove,5 - Remove,REMOVE,REMOVE,,,,,


In [24]:
all(df[42]==df[43])

True

In [28]:
df[42].value_counts()

REMOVE    66
KEEP      34
Name: 42, dtype: int64

In [25]:
df[2]

0                                 Meth’d up MAGA trash 
1     Lol I just made a comment it was removed for g...
2     Apparently you can’t say negative stuff about ...
3                          She is meth’d up MAGA trash 
4                Rest In Peace GQP… it’s over for sure 
                            ...                        
95    Well, to be fair, many of us hate rand almost ...
96                                         Maga trash. 
97                                             Finally 
98                                      Good riddance. 
99                                       Weird science!
Name: 2, Length: 100, dtype: object

In [48]:
model = models[('content', 'binary')]
transformer = transformers[('content', 'binary')]

test_features = transformer.transform(df[2])
test_pred = model.predict(test_features)
y_true = (df[42]=='REMOVE').astype(int)

accuracy_score(test_pred, y_true), recall_score(test_pred, y_true)

(0.46, 0.7727272727272727)

In [39]:
df.iloc[0]

0                                   4323245421758513930
1                                                      
2                                 Meth’d up MAGA trash 
3                                   2021-08-30 21:13:51
4                                                     0
5     https://www.independent.co.uk/news/world/ameri...
6                                                   Yes
7                                                   Yes
8                                                   Yes
9                                                   Yes
10                                                  Yes
11                                                  Yes
12                                                     
13                                                     
14                                                     
15                                                     
16                                                     
17                                              

In [40]:
df[36].value_counts()

1 - Not unsubstantive                 50
5 - Comment is unsubstantive          49
2 - Grey area but leaning toward 1     1
Name: 36, dtype: int64

In [49]:
def is_positive(val):
    if len(val) > 0:
        return int(val.startswith('5'))
    return 0

y_true = df[36].map(is_positive)
accuracy_score(test_pred, y_true), recall_score(test_pred, y_true)

(0.55, 0.5909090909090909)