In [1]:
import pandas as pd
import numpy as np
import os

import sys
from tqdm import tqdm

import nltk

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
import os

from sklearn.metrics import recall_score, precision_score

sys.path.insert(0,'/home/roshansk/YelpAnalysis/Datasets/')

from createDataset import *

import spacy

%load_ext autoreload
%autoreload 2



In [2]:
nlp = spacy.load('/data2/link10/models/fasttext/en_fasttext_crawl')

In [31]:
help(getData)

Help on function getData in module createDataset:

getData(dataFolder, dataset, balanceTrain=True)
    Available options for dataset
        yelp
        yelp_subset
        yelp_1v5
        amazon_finefood
        amazon_finefood_subset
        amazon_toys
        amazon_toys_subset
        empathy
        nrc_joy
        nrc_sadness
        nrc_fear
        nrc_anger
        nrc_surprise



#### External Data 

In [58]:
dataFolder = '/home/roshansk/YelpAnalysis/Datasets/'

df = getExternalData(dataFolder, 'dialog')

# trainDf, valDf, testDf = getData(dataFolder,'amazon_finefood_subset')
# trainDf = trainDf[['text','label']]
# trainDf = trainDf.drop_duplicates()
# print(len(trainDf))

#### Loading Lexicon

In [52]:
lexicon = pd.read_csv('/home/roshansk/YelpAnalysis/Lexicons/LSTM/nyelp_subset_lstm_div.csv')

lexiconWords = set(lexicon.word.values)

lexiconMap = {}

for i in range(len(lexicon)):
    lexiconMap[lexicon.iloc[i]['word']] = lexicon.iloc[i]['score']

In [53]:
len(lexiconWords)

24923

### Evaluating Lexicon

In [7]:
def getLexicon(file):
    
    lexicon = pd.read_csv(file)
    
    if 'scores' in lexicon.columns:
        lexicon.rename({'scores':'score'},axis =1, inplace = True)

    lexiconWords = set(lexicon.word.values)

    lexiconMap = {}

    for i in range(len(lexicon)):
        lexiconMap[lexicon.iloc[i]['word']] = lexicon.iloc[i]['score']
        
    return lexiconWords, lexiconMap


def scoreText(text, lexiconWords, lexiconMap):
    
    score = 0
    
    
    doc = nlp(text.lower())
    tokenList = [token.text for token in doc]

    for token in tokenList:
        if token in lexiconWords:
            score += lexiconMap[token]
            
    return score/len(tokenList)
 
def evaluateLexicon(testDf, lexiconWords, lexiconMap, dataName, lexiconName):
    
    ### Getting lexicon scores for text
    scoreList = []

    for i in range(len(testDf)):
        score = scoreText(testDf.iloc[i]['text'].lower(), lexiconWords, lexiconMap)

        scoreList.append(score)
        
    testDf['score'] = scoreList
    
    ### Training model for classification
    model = LogisticRegression()
    X = testDf.score.values.reshape(-1,1)
    y = testDf.label
    
    
    ### Computing Metrics
    acc = np.round(np.mean(cross_val_score(model, X, y, cv=5, scoring='accuracy')),3)
    f1 = np.round(np.mean(cross_val_score(model, X, y, cv=5, scoring='f1')),3)
    precision = np.round(np.mean(cross_val_score(model, X, y, cv=5, scoring='average_precision')),3)
    auc = np.round(np.mean(cross_val_score(model, X, y, cv=5, scoring='roc_auc')),3)

#     print(f" ACC | F1 | Precision | AUC ")
    print(f" {dataName} , {lexiconName} , {acc} , {f1} , {precision} , {auc}")
    
    
def runExperiment(testDf, lexiconList,  dataName):
    
    
    for lexicon in lexiconList:
        lexiconWords, lexiconMap = getLexicon(lexicon)
        
        lexiconName = lexicon.replace("/home/roshansk/YelpAnalysis/Lexicons/",'')
        evaluateLexicon(trainDf, lexiconWords, lexiconMap, dataName, lexiconName)
        
#         print("-"*10)
        
    
    

In [None]:
# Test : Dialog  | Lexicon : Yelp_subset
evaluateLexicon(df, lexiconWords, lexiconMap)

### Experiments

In [8]:
lexiconList = []

for path, subdirs, files in os.walk('/home/roshansk/YelpAnalysis/Lexicons/'):
    for name in files:
        lexiconList.append(os.path.join(path, name))
        
lexiconList

['/home/roshansk/YelpAnalysis/Lexicons/SVM/nrc_sadness_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nyelp_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nyelp_subset_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/empathy_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nrc_joy_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/amazon_baby_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nyelp_1v5_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/amazon_finefood_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nrc_surprise_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nrc_fear_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nrc_anger_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/amazon_toys_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/FFN/NN_NRCJoy_Lexicon.csv',
 '/home/rosh

In [9]:
lexiconList = [
#  '/home/roshansk/YelpAnalysis/Lexicons/SVM/nrc_sadness_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nyelp_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nyelp_subset_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/empathy_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nrc_joy_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/amazon_baby_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/nyelp_1v5_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/amazon_finefood_fasttext_results.csv',
#  '/home/roshansk/YelpAnalysis/Lexicons/SVM/nrc_surprise_fasttext_results.csv',
#  '/home/roshansk/YelpAnalysis/Lexicons/SVM/nrc_fear_fasttext_results.csv',
#  '/home/roshansk/YelpAnalysis/Lexicons/SVM/nrc_anger_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/SVM/amazon_toys_fasttext_results.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/FFN/NN_NRCJoy_Lexicon.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/FFN/NN_Yelp_subset_Lexicon.csv',
#  '/home/roshansk/YelpAnalysis/Lexicons/LSTM/nrc_sadness_lstm_div.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/LSTM/nyelp_subset_lstm_div.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/LSTM/nyelp_1v5_lstm_div.csv',
#  '/home/roshansk/YelpAnalysis/Lexicons/LSTM/nrc_surprise_lstm_div.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/LSTM/amazon_finefood_lstm_div.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/LSTM/amazon_baby_lstm_div.csv',
#  '/home/roshansk/YelpAnalysis/Lexicons/LSTM/nrc_anger_lstm_div.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/LSTM/amazon_toys_lstm_div.csv',
 '/home/roshansk/YelpAnalysis/Lexicons/LSTM/empathy_lstm_div.csv',
#  '/home/roshansk/YelpAnalysis/Lexicons/LSTM/nrc_fear_lstm_div.csv',
#  '/home/roshansk/YelpAnalysis/Lexicons/LSTM/nrc_joy_lstm_div.csv'
]

In [72]:
### External Dataset

dataFolder = '/home/roshansk/YelpAnalysis/Datasets/'

df = getExternalData(dataFolder, 'dialog')

runExperiment(df, lexiconList, dataName='dialog')

# trainDf, valDf, testDf = getData(dataFolder,'amazon_finefood_subset')
# trainDf = trainDf[['text','label']]
# trainDf = trainDf.drop_duplicates()
# print(len(trainDf))

 dialog , SVM/nyelp_fasttext_results.csv , 0.877 , 0.931 , 0.972 , 0.874
----------
 dialog , SVM/nyelp_subset_fasttext_results.csv , 0.867 , 0.926 , 0.968 , 0.851
----------
 dialog , SVM/empathy_fasttext_results.csv , 0.85 , 0.919 , 0.916 , 0.67
----------
 dialog , SVM/nrc_joy_fasttext_results.csv , 0.853 , 0.92 , 0.933 , 0.724
----------
 dialog , SVM/amazon_baby_fasttext_results.csv , 0.873 , 0.929 , 0.972 , 0.868
----------
 dialog , SVM/nyelp_1v5_fasttext_results.csv , 0.874 , 0.929 , 0.972 , 0.871
----------
 dialog , SVM/amazon_finefood_fasttext_results.csv , 0.891 , 0.938 , 0.98 , 0.904
----------
 dialog , SVM/amazon_toys_fasttext_results.csv , 0.873 , 0.929 , 0.972 , 0.868
----------
 dialog , FFN/NN_NRCJoy_Lexicon.csv , 0.85 , 0.919 , 0.92 , 0.676
----------
 dialog , FFN/NN_Yelp_subset_Lexicon.csv , 0.865 , 0.925 , 0.963 , 0.835
----------
 dialog , LSTM/nyelp_subset_lstm_div.csv , 0.851 , 0.92 , 0.938 , 0.742
----------
 dialog , LSTM/nyelp_1v5_lstm_div.csv , 0.851 , 0.9

In [6]:
### External Dataset

dataFolder = '/home/roshansk/YelpAnalysis/Datasets/'

trainDf, valDf, testDf = getData(dataFolder,'amazon_finefood_subset')
trainDf = trainDf[['text','label']]
trainDf = trainDf.drop_duplicates()
print(len(trainDf))

runExperiment(trainDf, lexiconList, dataName = 'amazon_finefood_subset')



7688
 amazon_finefood_subset , SVM/nyelp_fasttext_results.csv , 0.877 , 0.931 , 0.972 , 0.874
 amazon_finefood_subset , SVM/nyelp_subset_fasttext_results.csv , 0.867 , 0.926 , 0.968 , 0.851
 amazon_finefood_subset , SVM/empathy_fasttext_results.csv , 0.85 , 0.919 , 0.916 , 0.67
 amazon_finefood_subset , SVM/nrc_joy_fasttext_results.csv , 0.853 , 0.92 , 0.933 , 0.724
 amazon_finefood_subset , SVM/amazon_baby_fasttext_results.csv , 0.873 , 0.929 , 0.972 , 0.868
 amazon_finefood_subset , SVM/nyelp_1v5_fasttext_results.csv , 0.874 , 0.929 , 0.972 , 0.871
 amazon_finefood_subset , SVM/amazon_finefood_fasttext_results.csv , 0.891 , 0.938 , 0.98 , 0.904
 amazon_finefood_subset , SVM/amazon_toys_fasttext_results.csv , 0.873 , 0.929 , 0.972 , 0.868
 amazon_finefood_subset , FFN/NN_NRCJoy_Lexicon.csv , 0.85 , 0.919 , 0.92 , 0.676
 amazon_finefood_subset , FFN/NN_Yelp_subset_Lexicon.csv , 0.865 , 0.925 , 0.963 , 0.835
 amazon_finefood_subset , LSTM/nyelp_subset_lstm_div.csv , 0.851 , 0.92 , 0.938

In [9]:
### External Dataset

dataFolder = '/home/roshansk/YelpAnalysis/Datasets/'

trainDf, valDf, testDf = getData(dataFolder,'nrc_joy')
trainDf = trainDf[['text','label']]
trainDf = trainDf.drop_duplicates()
print(len(trainDf))

runExperiment(trainDf, lexiconList, dataName = 'nrc_joy')



16458
 nrc_joy , SVM/nyelp_fasttext_results.csv , 0.644 , 0.37 , 0.545 , 0.642
 nrc_joy , SVM/nyelp_subset_fasttext_results.csv , 0.64 , 0.327 , 0.529 , 0.623
 nrc_joy , SVM/empathy_fasttext_results.csv , 0.619 , 0.193 , 0.491 , 0.594
 nrc_joy , SVM/nrc_joy_fasttext_results.csv , 0.776 , 0.694 , 0.779 , 0.838
 nrc_joy , SVM/amazon_baby_fasttext_results.csv , 0.642 , 0.369 , 0.54 , 0.644
 nrc_joy , SVM/nyelp_1v5_fasttext_results.csv , 0.645 , 0.359 , 0.542 , 0.634
 nrc_joy , SVM/amazon_finefood_fasttext_results.csv , 0.648 , 0.403 , 0.552 , 0.659
 nrc_joy , SVM/amazon_toys_fasttext_results.csv , 0.642 , 0.369 , 0.54 , 0.644
 nrc_joy , FFN/NN_NRCJoy_Lexicon.csv , 0.774 , 0.693 , 0.792 , 0.844
 nrc_joy , FFN/NN_Yelp_subset_Lexicon.csv , 0.636 , 0.31 , 0.52 , 0.608
 nrc_joy , LSTM/nyelp_subset_lstm_div.csv , 0.606 , 0.027 , 0.411 , 0.501
 nrc_joy , LSTM/nyelp_1v5_lstm_div.csv , 0.604 , 0.024 , 0.443 , 0.582
 nrc_joy , LSTM/amazon_finefood_lstm_div.csv , 0.605 , 0.015 , 0.426 , 0.525
 nrc_j

In [6]:
### External Dataset

dataFolder = '/home/roshansk/YelpAnalysis/Datasets/'

trainDf, valDf, testDf = getData(dataFolder,'empathy')
trainDf = trainDf[['text','label']]
trainDf = trainDf.drop_duplicates()
print(len(trainDf))

runExperiment(trainDf, lexiconList, dataName = 'empathy')



1487
 empathy , SVM/nyelp_fasttext_results.csv , 0.546 , 0.446 , 0.533 , 0.546
 empathy , SVM/nyelp_subset_fasttext_results.csv , 0.549 , 0.467 , 0.541 , 0.556
 empathy , SVM/empathy_fasttext_results.csv , 0.718 , 0.704 , 0.777 , 0.789
 empathy , SVM/nrc_joy_fasttext_results.csv , 0.553 , 0.49 , 0.562 , 0.575
 empathy , SVM/amazon_baby_fasttext_results.csv , 0.526 , 0.414 , 0.535 , 0.543
 empathy , SVM/nyelp_1v5_fasttext_results.csv , 0.551 , 0.474 , 0.548 , 0.565
 empathy , SVM/amazon_finefood_fasttext_results.csv , 0.535 , 0.441 , 0.536 , 0.549
 empathy , SVM/amazon_toys_fasttext_results.csv , 0.526 , 0.414 , 0.535 , 0.543
 empathy , FFN/NN_NRCJoy_Lexicon.csv , 0.545 , 0.455 , 0.553 , 0.549
 empathy , FFN/NN_Yelp_subset_Lexicon.csv , 0.558 , 0.474 , 0.547 , 0.565
 empathy , LSTM/nyelp_subset_lstm_div.csv , 0.514 , 0.0 , 0.514 , 0.517
 empathy , LSTM/nyelp_1v5_lstm_div.csv , 0.507 , 0.1 , 0.508 , 0.532
 empathy , LSTM/amazon_finefood_lstm_div.csv , 0.514 , 0.053 , 0.495 , 0.49
 empath

In [7]:
### External Dataset

dataFolder = '/home/roshansk/YelpAnalysis/Datasets/'

trainDf, valDf, testDf = getData(dataFolder,'amazon_toys_subset')
trainDf = trainDf[['text','label']]
trainDf = trainDf.drop_duplicates()
print(len(trainDf))

runExperiment(trainDf, lexiconList, dataName = 'amazon_toys_subset')



8000
 amazon_toys_subset , SVM/nyelp_fasttext_results.csv , 0.939 , 0.968 , 0.989 , 0.874
 amazon_toys_subset , SVM/nyelp_subset_fasttext_results.csv , 0.941 , 0.969 , 0.987 , 0.853
 amazon_toys_subset , SVM/empathy_fasttext_results.csv , 0.938 , 0.968 , 0.966 , 0.67
 amazon_toys_subset , SVM/nrc_joy_fasttext_results.csv , 0.938 , 0.968 , 0.974 , 0.733
 amazon_toys_subset , SVM/amazon_baby_fasttext_results.csv , 0.942 , 0.97 , 0.993 , 0.911
 amazon_toys_subset , SVM/nyelp_1v5_fasttext_results.csv , 0.94 , 0.969 , 0.989 , 0.87
 amazon_toys_subset , SVM/amazon_finefood_fasttext_results.csv , 0.938 , 0.968 , 0.99 , 0.878
 amazon_toys_subset , SVM/amazon_toys_fasttext_results.csv , 0.942 , 0.97 , 0.993 , 0.911
 amazon_toys_subset , FFN/NN_NRCJoy_Lexicon.csv , 0.938 , 0.968 , 0.969 , 0.691
 amazon_toys_subset , FFN/NN_Yelp_subset_Lexicon.csv , 0.939 , 0.968 , 0.985 , 0.834
 amazon_toys_subset , LSTM/nyelp_subset_lstm_div.csv , 0.938 , 0.968 , 0.971 , 0.694
 amazon_toys_subset , LSTM/nyelp_1

In [11]:
### External Dataset

dataFolder = '/home/roshansk/YelpAnalysis/Datasets/'

trainDf, valDf, testDf = getData(dataFolder,'yelp_subset')
trainDf = trainDf[['text','label']]
trainDf = trainDf.drop_duplicates()
print(len(trainDf))

runExperiment(trainDf, lexiconList, dataName = 'yelp')



8000
 yelp , SVM/nyelp_fasttext_results.csv , 0.915 , 0.946 , 0.988 , 0.961
 yelp , SVM/nyelp_subset_fasttext_results.csv , 0.924 , 0.951 , 0.991 , 0.967
 yelp , SVM/empathy_fasttext_results.csv , 0.775 , 0.871 , 0.902 , 0.724
 yelp , SVM/nrc_joy_fasttext_results.csv , 0.791 , 0.878 , 0.917 , 0.773
 yelp , SVM/amazon_baby_fasttext_results.csv , 0.882 , 0.926 , 0.978 , 0.93
 yelp , SVM/nyelp_1v5_fasttext_results.csv , 0.908 , 0.941 , 0.988 , 0.959
 yelp , SVM/amazon_finefood_fasttext_results.csv , 0.891 , 0.931 , 0.98 , 0.936
 yelp , SVM/amazon_toys_fasttext_results.csv , 0.882 , 0.926 , 0.978 , 0.93
 yelp , FFN/NN_NRCJoy_Lexicon.csv , 0.774 , 0.87 , 0.895 , 0.709
 yelp , FFN/NN_Yelp_subset_Lexicon.csv , 0.912 , 0.944 , 0.988 , 0.96
 yelp , LSTM/nyelp_subset_lstm_div.csv , 0.832 , 0.898 , 0.944 , 0.84
 yelp , LSTM/nyelp_1v5_lstm_div.csv , 0.803 , 0.887 , 0.928 , 0.82
 yelp , LSTM/amazon_finefood_lstm_div.csv , 0.778 , 0.875 , 0.856 , 0.639
 yelp , LSTM/amazon_baby_lstm_div.csv , 0.778 ,

#### NN_yelp_subset

100%|██████████| 13538/13538 [00:04<00:00, 2764.31it/s]


 ACC | F1 | Precision | AUC 
 0.835 , 0.905 , 0.949 , 0.824


In [44]:
# Test : amazon_finefood_subset  | Lexicon : Yelp_subset
evaluateLexicon(trainDf, lexiconWords, lexiconMap)

100%|██████████| 8000/8000 [00:05<00:00, 1420.91it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


 ACC | F1 | Precision | AUC 
 0.864 , 0.924 , 0.963 , 0.835


In [51]:
# Test : amazon_toys_subset  | Lexicon : Yelp_subset
evaluateLexicon(trainDf, lexiconWords, lexiconMap)

100%|██████████| 8000/8000 [00:08<00:00, 992.56it/s] 


 ACC | F1 | Precision | AUC 
 0.939 , 0.968 , 0.985 , 0.834


#### nyelp_subset_lstm_div

In [59]:
# Test : Dialog  | Lexicon : nyelp_subset_lstm_div
evaluateLexicon(df, lexiconWords, lexiconMap)

100%|██████████| 13538/13538 [00:04<00:00, 2743.75it/s]


 ACC | F1 | Precision | AUC 
 0.816 , 0.899 , 0.912 , 0.698


In [56]:
# Test : amazon_finefood_subset  | Lexicon : nyelp_subset_lstm_div
evaluateLexicon(trainDf, lexiconWords, lexiconMap)

100%|██████████| 7688/7688 [00:04<00:00, 1666.48it/s]


 ACC | F1 | Precision | AUC 
 0.851 , 0.92 , 0.938 , 0.742


In [54]:
# Test : amazon_toys_subset  | Lexicon : nyelp_subset_lstm_div
evaluateLexicon(trainDf, lexiconWords, lexiconMap)

100%|██████████| 8000/8000 [00:07<00:00, 1000.43it/s]


 ACC | F1 | Precision | AUC 
 0.938 , 0.968 , 0.971 , 0.694


#### Setting Threshold values

In [63]:
pd.Series(scoreList).describe()

count    22076.000000
mean         3.641149
std          2.357135
min         -6.787894
25%          2.089142
50%          3.264231
75%          4.840550
max         25.217315
dtype: float64

In [69]:
#### Set the list of thresholds here

thresholdList = [-20,-10,0,3,4,5, 10,20,30]

# thresholdList = [-.3,-.2,-.1,0,.1,.2,.3,]

# thresholdList = [-20,-10,-.3,-.2,-.1,0,.1,.2,.3,10,20,30]

In [70]:
for threshold in thresholdList:
    
    df['pred'] = df.score.apply(lambda x : 1 if x> threshold else 0)
    
    acc = np.round(accuracy_score(df.label, df.pred),3)
    f1 = np.round(f1_score(df.label, df.pred, average = 'micro'),3)
    recall = np.round(recall_score(df.label, df.pred, average = 'micro'),3)
    precision = np.round(precision_score(df.label, df.pred, average = 'micro'),3)
    
    print(f"Threshold : {threshold:4} Acc : {acc:5}  F1 : {f1:5} Prec : {precision:5}  Recall : {recall:5}" )

Threshold :  -20 Acc :   0.5  F1 :   0.5 Prec :   0.5  Recall :   0.5
Threshold :  -10 Acc :   0.5  F1 :   0.5 Prec :   0.5  Recall :   0.5
Threshold :    0 Acc : 0.517  F1 : 0.517 Prec : 0.517  Recall : 0.517
Threshold :    3 Acc : 0.709  F1 : 0.709 Prec : 0.709  Recall : 0.709
Threshold :    4 Acc : 0.708  F1 : 0.708 Prec : 0.708  Recall : 0.708
Threshold :    5 Acc : 0.663  F1 : 0.663 Prec : 0.663  Recall : 0.663
Threshold :   10 Acc : 0.514  F1 : 0.514 Prec : 0.514  Recall : 0.514
Threshold :   20 Acc : 0.501  F1 : 0.501 Prec : 0.501  Recall : 0.501
Threshold :   30 Acc :   0.5  F1 :   0.5 Prec :   0.5  Recall :   0.5


Threshold :  -20 Acc :   0.5  F1 :   0.5 Prec :   0.5  Recall :   0.5
Threshold :  -10 Acc : 0.501  F1 : 0.501 Prec : 0.501  Recall : 0.501
Threshold :    0 Acc : 0.769  F1 : 0.769 Prec : 0.769  Recall : 0.769
Threshold :   10 Acc : 0.503  F1 : 0.503 Prec : 0.503  Recall : 0.503
Threshold :   20 Acc :   0.5  F1 :   0.5 Prec :   0.5  Recall :   0.5
Threshold :   30 Acc :   0.5  F1 :   0.5 Prec :   0.5  Recall :   0.5


In [14]:
os.getcwd()

'/home/roshansk/YelpAnalysis/ExternalData_Evaluation'

In [37]:
df.label.value_counts()

1    11038
0     2500
Name: label, dtype: int64

In [31]:
df['pred'] = df.score.apply(lambda x : 1 if x> threshold else 0)

In [32]:
df.head()

Unnamed: 0,text,label,score,pred
0,That's a good idea . I hear Mary and Sally oft...,1,-0.04638,0
1,"Sounds great to me ! If they are willing , we ...",1,0.000971,0
2,Good.Let ' s go now .,1,0.088895,0
3,All right .,1,0.016254,0
7,I ’ m looking at my horoscope for this month !...,1,0.153852,0


In [66]:
df.score.describe()

count    22076.000000
mean         3.641149
std          2.357135
min         -6.787894
25%          2.089142
50%          3.264231
75%          4.840550
max         25.217315
Name: score, dtype: float64

In [14]:
help(precision_score)

Help on function precision_score in module sklearn.metrics._classification:

precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')
    Compute the precision
    
    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The precision is
    intuitively the ability of the classifier not to label as positive a sample
    that is negative.
    
    The best value is 1 and the worst value is 0.
    
    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.
    
    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    
    labels : list, optional
        The set of labels to include when ``average != 'binary