In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import binarize
from sklearn.metrics import classification_report, roc_auc_score, log_loss, mean_squared_error, brier_score_loss

In [None]:
test_baseline = pd.read_csv("data/predict_and_result/flocks_test_data_no_nans.csv") # Multiclass Test Baseline
test_result = pd.read_csv("data/predict_and_result/flocks_test_result.csv") # Multiclass Test Prediction Results

In [None]:
y_true = test_baseline.drop(columns=["id", "comment_text"])
columns = y_true.columns.to_list()
y_true_binary = pd.DataFrame(binarize(y_true, threshold = 0.5), columns=columns)

In [None]:
y_pred_prob = test_result.drop(columns=["Unnamed: 0"])
columns = y_pred_prob.columns.to_list()
y_pred_binary = pd.DataFrame(binarize(y_pred_prob, threshold = 0.39), columns=columns)

In [None]:
# Brier score is only defined for binary ground truth, so the raw 'toxic'
# probabilities are scored against the binarized labels ("non-binarized"
# refers to the predictions, as in the ROC AUC lines below).
print("'toxic' brier score (non-binarized):", brier_score_loss(y_true_binary["toxic"], y_pred_prob["toxic"]))
print("roc_auc_score (non-binarized):", roc_auc_score(y_true_binary, y_pred_prob))
print("roc_auc_score (binarized):", roc_auc_score(y_true_binary, y_pred_binary))
# `eps` is left at the library default: 1e-15 was the historical default and
# the argument is deprecated in newer scikit-learn releases.
# NOTE(review): y_true_binary is a multi-label indicator matrix; confirm that
# log_loss's handling of such input is the metric intended here.
print("log_loss", log_loss(y_true_binary, y_pred_prob))

In [None]:
mean_squared_error(y_true[["toxic"]], y_pred_prob[["toxic"]], sample_weight=None, multioutput='uniform_average')

In [None]:
mean_squared_error(y_true, y_pred_prob, sample_weight=None, multioutput="raw_values")

## Import and Clean Data

In [None]:
# IMPORT FROM CSV's
df = pd.read_csv('data/hn_all_w_sentiment_cleaned_inplace.csv')
print(df.shape)

In [None]:
df.info()

In [None]:
df = df[['commentor',
 'comment_time',
 'commentid',
 'parentid',
 'author',
 'story_time',
 'parent_type',
 'cleaned_comment',
 'cleaned_title',
 'comment_polarity',
 'comment_subjectivity']]

In [None]:
df_a = pd.read_csv("data/predict_and_result/result_1_of_2.csv")
df_b = pd.read_csv("data/predict_and_result/result2_of_2.csv")

In [None]:
df_a = pd.concat([df_a, df_b])
del(df_b)

In [None]:
df_a.reset_index(drop = True, inplace=True)

In [None]:
df[["toxic", "severe_toxicity", "obscene", "identity_attack", "insult", "threat"]]= df_a[["toxic", "severe_toxicity", "obscene", "identity_attack", "insult", "threat"]]
del(df_a)
df.info()

In [None]:
# Shrink every float column to the smallest float dtype that can hold it.
df_floats = list(df.select_dtypes(include=['float']).columns)
df[df_floats] = df[df_floats].apply(pd.to_numeric, downcast='float')

In [None]:
# Shrink every integer column to the smallest unsigned dtype that can hold it.
df_ints = list(df.select_dtypes(include=['int']).columns)
df[df_ints] = df[df_ints].apply(pd.to_numeric, downcast='unsigned')

In [None]:
df.to_csv("data/predict_and_result/data_and_results_w_selected_fields.csv", index = False)

In [None]:
df.head()

In [None]:
%%time
# Replace every missing value in every column with 0 (text columns included).
df = df.fillna(value=0)
# Switch to the parent_* / comment_* naming used by the later cells.
# `score` and `ranking` are not among the columns selected earlier in this
# notebook, so those two mappings have no effect here (rename skips labels
# that are not present).
df = df.rename(columns={'author': 'parent_author', 
                        'cleaned_title': 'parent_title',
                        'score': 'parent_score', 
                        'story_time': 'parent_time', 
                        'ranking': 'comment_rank',
                        'commentid':'comment_id',
                        'parentid':'parent_id'})
# Ascending sort: lowest `toxic` first, ties broken by subjectivity, then polarity.
df = df.sort_values(by = ["toxic",'comment_subjectivity','comment_polarity'])
# Peek at both ends of the sorted frame.
display(df.head(3))
display(df.tail(3))

In [None]:
%%time
# Legacy sentiment-based score: polarity scaled by subjectivity.
df['old_saltiness'] = df['comment_polarity'] * df['comment_subjectivity']
# Rescale `toxic` so that 0.5 maps to 0 and higher toxicity is more negative
# (presumably `toxic` is a probability in [0, 1], giving a range of [-1, 1]).
df['comment_saltiness'] = (0.5 - df['toxic']) * 2

# Label weights; not used anywhere in this cell.
toxic_w = 1
s_toxic_w = 10
threat_w = 8

# A comment is salty when its saltiness is below zero, i.e. `toxic` > 0.5.
df['is_salty'] = df['comment_saltiness'] < 0

# Every other label is flagged when its score is strictly above 0.5.
# Vectorized comparisons give the same booleans as a per-row lambda.
flag_sources = {'is_severe_toxic': 'severe_toxicity',
                'is_obscene': 'obscene',
                'is_identity_attack': 'identity_attack',
                'is_insult': 'insult',
                'is_threat': 'threat'}
for flag_col, score_col in flag_sources.items():
    df[flag_col] = df[score_col] > .5

In [2]:
df = pd.read_csv("data/predict_and_result/saving_work.csv").sort_values(by = ['severe_toxicity','obscene','toxic','threat'], ascending=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
df = df.sort_values(by = ['severe_toxicity','obscene','toxic','threat'], ascending=False)

In [4]:
# `display` and `HTML` are part of the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [6]:
df[["comment_id", "cleaned_comment"]].to_csv("data/predict_and_result/hail_mary.csv", index = False)

In [5]:
def print_full(x):
    """Display the first 300 rows of `x` without any truncation.

    Temporarily lifts the pandas display limits (rows, columns, width and
    column width) and formats floats with two decimals. The previous option
    values are restored afterwards, even if `display` raises.

    Args:
        x: A pandas object supporting `.head()` (DataFrame or Series).
    """
    x = x.head(300)
    # option_context restores whatever was configured before the call, rather
    # than resetting to the pandas defaults. `None` is the supported way to
    # disable the column-width limit (a negative width is deprecated).
    with pd.option_context('display.max_rows', len(x),
                           'display.max_columns', None,
                           'display.width', 2000,
                           'display.float_format', '{:20,.2f}'.format,
                           'display.max_colwidth', None):
        display(x)
    
print_full(df)

Unnamed: 0,commentor,comment_time,comment_id,parent_id,parent_author,parent_time,parent_type,cleaned_comment,parent_title,comment_polarity,comment_subjectivity,toxic,severe_toxicity,obscene,identity_attack,insult,threat,old_saltiness,comment_saltiness,is_salty,is_severe_toxic,is_obscene,is_identity_attack,is_insult,is_threat,comment_JSON
103498,angersock,1320249925,3187426,3187113,rwl,1320246800.0,comment,"With all due respect... ""I think this is a completely misguided assumption. Maybe this is true these days. But it wasn't always true, and it shouldn't be true."" What the past state was does not matter a great deal in these issues--today's problems are what stand between us and the food on our tables. If you read that economic growth as not being responsible for funding of higher education, you are similarly incorrect. At the national level it's about having strong soft power and at the personal level about it's about ""keeping up with the Joneses"" re: income. ""Higher education, especially in the liberal arts, is for training students to become good community members, citizens and leaders."" It was my impression that high schools and parents were the mechanism for indoctrinating youths into community values. Moreover, without playing too hard to personal experience, if my classmates at uni are tomorrow's leaders I'm a great deal concerned. ""It's about giving them the cognitive, rhetorical, and even emotional skills required to solve the hard problems that we face as a society. And there is a lot more to solving those problems than promoting ""innovation"" or ""economic growth""."" What are these ""hard problems""? Hunger? Disease? Working boring jobs to shuffle little bits of paper around? I'd wager that the root of your ""hard problems"" is ultimately scarcity in one form or another, and the only way to address that is through technology, and the only way to get that is through STEM training. ""We subsidize higher education because producing citizens with those skills is valuable to our society."" You echo your given quote here--you instead handwave the ""skills"" involved whereas the quote specifically suggests that economic growth is the valuable criterion to pick skills with. ""Are liberal arts programs perfect? No. Do they cost too much? Maybe. 
Would it be good for society to have more people going into STEM fields? Probably. Does all this mean that liberal arts education is ""oversold""? Certainly not."" I agree, disagree, agree, and disagree again. Liberal arts programs seem to consist primarily of a great deal of reading and research, augmented with the professional musings of an elder in the field. Is my impression incorrect? If it is not, is there a particular reason why it costs so much when that material is so cheaply available? I'd appreciate clarification on this matter. ""Indeed, it is rare to see people express just how valuable the liberal arts are, and why."" You know, there is at least one simple explanation that suggests itself for explaining this phenomena...",Another Comment,0.15,0.55,0.94,0.67,0.94,0.68,0.9,0.22,0.08,-0.88,True,True,True,True,True,False,"{'commentor': 'angersock', 'comment_time': 1320249925, 'comment_saltiness': -0.8782680034637451, 'is_salty': True, 'is_severe_toxic': True, 'is_obscene': True, 'is_identity_attack': True, 'is_insult': True, 'is_threat': False, 'parent_type': 'comment', 'parent_author': 'rwl', 'parent_title': 'Another Comment', 'cleaned_comment': 'With all due respect... ""I think this is a completely misguided assumption. Maybe this is true these days. But it wasn\'t always true, and it shouldn\'t be true."" What the past state was does not matter a great deal in these issues--today\'s problems are what stand between us and the food on our tables. If you read that economic growth as not being responsible for funding of higher education, you are similarly incorrect. At the national level it\'s about having strong soft power and at the personal level about it\'s about ""keeping up with the Joneses"" re: income. 
""Higher education, especially in the liberal arts, is for training students to become good community members, citizens and leaders."" It was my impression that high schools and parents were the mechanism for indoctrinating youths into community values. Moreover, without playing too hard to personal experience, if my classmates at uni are tomorrow\'s leaders I\'m a great deal concerned. ""It\'s about giving them the cognitive, rhetorical, and even emotional skills required to solve the hard problems that we face as a society. And there is a lot more to solving those problems than promoting ""innovation"" or ""economic growth""."" What are these ""hard problems""? Hunger? Disease? Working boring jobs to shuffle little bits of paper around? I\'d wager that the root of your ""hard problems"" is ultimately scarcity in one form or another, and the only way to address that is through technology, and the only way to get that is through STEM training. ""We subsidize higher education because producing citizens with those skills is valuable to our society."" You echo your given quote here--you instead handwave the ""skills"" involved whereas the quote specifically suggests that economic growth is the valuable criterion to pick skills with. ""Are liberal arts programs perfect? No. Do they cost too much? Maybe. Would it be good for society to have more people going into STEM fields? Probably. Does all this mean that liberal arts education is ""oversold""? Certainly not."" I agree, disagree, agree, and disagree again. Liberal arts programs seem to consist primarily of a great deal of reading and research, augmented with the professional musings of an elder in the field. Is my impression incorrect? If it is not, is there a particular reason why it costs so much when that material is so cheaply available? I\'d appreciate clarification on this matter. 
""Indeed, it is rare to see people express just how valuable the liberal arts are, and why."" You know, there is at least one simple explanation that suggests itself for explaining this phenomena...', 'comment_id': 3187426, 'parent_id': 3187113}"
90562,kardos,1522107001,16683693,16683642,eggpy,1522106600.0,comment,Revenue,Another Comment,0.0,0.0,0.94,0.59,0.85,0.59,0.84,0.65,0.0,-0.89,True,True,True,True,True,True,"{'commentor': 'kardos', 'comment_time': 1522107001, 'comment_saltiness': -0.8875666856765747, 'is_salty': True, 'is_severe_toxic': True, 'is_obscene': True, 'is_identity_attack': True, 'is_insult': True, 'is_threat': True, 'parent_type': 'comment', 'parent_author': 'eggpy', 'parent_title': 'Another Comment', 'cleaned_comment': 'Revenue', 'comment_id': 16683693, 'parent_id': 16683642}"
169311,teemo_cute,1395723711,7463926,7463671,snupples,1395718700.0,story,I love Java because when you add 'script' to it you have a pretty sweet language.,Why I like Java,0.37,0.75,0.94,0.57,0.86,0.77,0.9,0.2,0.28,-0.88,True,True,True,True,True,False,"{'commentor': 'teemo_cute', 'comment_time': 1395723711, 'comment_saltiness': -0.8775883913040161, 'is_salty': True, 'is_severe_toxic': True, 'is_obscene': True, 'is_identity_attack': True, 'is_insult': True, 'is_threat': False, 'parent_type': 'story', 'parent_author': 'snupples', 'parent_title': 'Why I like Java', 'cleaned_comment': ""I love Java because when you add 'script' to it you have a pretty sweet language."", 'comment_id': 7463926, 'parent_id': 7463671}"
265894,barrkel,1340798810,4166507,4165644,phillmv,1340774400.0,comment,"This is really, really not true. Some of (perhaps even most of) the best engineers I've worked with had almost zero public profile, almost to the point of being publicity phobic.",Another Comment,0.33,0.38,0.92,0.57,0.86,0.69,0.83,0.21,0.13,-0.83,True,True,True,True,True,False,"{'commentor': 'barrkel', 'comment_time': 1340798810, 'comment_saltiness': -0.8332148790359497, 'is_salty': True, 'is_severe_toxic': True, 'is_obscene': True, 'is_identity_attack': True, 'is_insult': True, 'is_threat': False, 'parent_type': 'comment', 'parent_author': 'phillmv', 'parent_title': 'Another Comment', 'cleaned_comment': ""This is really, really not true. Some of (perhaps even most of) the best engineers I've worked with had almost zero public profile, almost to the point of being publicity phobic."", 'comment_id': 4166507, 'parent_id': 4165644}"
251626,agentdrtran,1518128633,16336136,16335277,trav4225,1518121300.0,comment,"yeah, but all of these things exist becuase of the sheer amount of spam that gets sent to people.",Another Comment,0.0,0.75,0.91,0.57,0.91,0.6,0.85,0.26,0.0,-0.81,True,True,True,True,True,False,"{'commentor': 'agentdrtran', 'comment_time': 1518128633, 'comment_saltiness': -0.8105853796005249, 'is_salty': True, 'is_severe_toxic': True, 'is_obscene': True, 'is_identity_attack': True, 'is_insult': True, 'is_threat': False, 'parent_type': 'comment', 'parent_author': 'trav4225', 'parent_title': 'Another Comment', 'cleaned_comment': 'yeah, but all of these things exist becuase of the sheer amount of spam that gets sent to people.', 'comment_id': 16336136, 'parent_id': 16335277}"
29701,boothead,1450174002,10736872,10736714,matco11,1450170900.0,story,"It's great to read things like this... I wish my GP would do the same. I recently had a really high cholesterol reading (both total cholesterol and LDL). Everything else (blood pressure, blood glucose etc) seems fine. I'm 38, fit and healthy and nothing that suspect in my family history. I found the attitude of my doctor in all this quite surprising. It amounted basically to ""You definitely have familial hypercholesterolemia. There is no other possible option here other than statins"". No further questions about what I eat, my stress levels, lifestyle - nothing. What disappoints me the most here is now that I feel like it's all on me to determine what my real risk levels are and what's appropriate to treat this. I don't subscribe to the mainstream NHS view still heavily pushed that eating saturated fat -> high cholesterol == unhealthy as I think it's a lot more complicated (as this article shows). I don't like being is this situation as I'm as susceptible to human bias as the next person, and I'm not a doctor, however almost all of the high quality, science based writing I've read indicates that the mainstream healthcare system's view on cholesterol is wrong.",Inflammation: Medicine's burning question,0.13,0.51,0.94,0.56,0.57,0.68,0.81,0.74,0.06,-0.88,True,True,True,True,True,True,"{'commentor': 'boothead', 'comment_time': 1450174002, 'comment_saltiness': -0.8793870210647583, 'is_salty': True, 'is_severe_toxic': True, 'is_obscene': True, 'is_identity_attack': True, 'is_insult': True, 'is_threat': True, 'parent_type': 'story', 'parent_author': 'matco11', 'parent_title': ""Inflammation: Medicine's burning question"", 'cleaned_comment': 'It\'s great to read things like this... I wish my GP would do the same. I recently had a really high cholesterol reading (both total cholesterol and LDL). Everything else (blood pressure, blood glucose etc) seems fine. 
I\'m 38, fit and healthy and nothing that suspect in my family history. I found the attitude of my doctor in all this quite surprising. It amounted basically to ""You definitely have familial hypercholesterolemia. There is no other possible option here other than statins"". No further questions about what I eat, my stress levels, lifestyle - nothing. What disappoints me the most here is now that I feel like it\'s all on me to determine what my real risk levels are and what\'s appropriate to treat this. I don\'t subscribe to the mainstream NHS view still heavily pushed that eating saturated fat -> high cholesterol == unhealthy as I think it\'s a lot more complicated (as this article shows). I don\'t like being is this situation as I\'m as susceptible to human bias as the next person, and I\'m not a doctor, however almost all of the high quality, science based writing I\'ve read indicates that the mainstream healthcare system\'s view on cholesterol is wrong.', 'comment_id': 10736872, 'parent_id': 10736714}"
292584,daniel-cussen,1353942760,4832242,4830941,rdl,1353912000.0,comment,"Move the port out past the seawall, move the hipsters in to the abandoned industrial flats.",Another Comment,-0.25,0.25,0.94,0.56,0.82,0.77,0.89,0.19,-0.06,-0.87,True,True,True,True,True,False,"{'commentor': 'daniel-cussen', 'comment_time': 1353942760, 'comment_saltiness': -0.8740483522415161, 'is_salty': True, 'is_severe_toxic': True, 'is_obscene': True, 'is_identity_attack': True, 'is_insult': True, 'is_threat': False, 'parent_type': 'comment', 'parent_author': 'rdl', 'parent_title': 'Another Comment', 'cleaned_comment': 'Move the port out past the seawall, move the hipsters in to the abandoned industrial flats.', 'comment_id': 4832242, 'parent_id': 4830941}"
180877,Luc,1451489418,10812405,10812123,vilius,1451485300.0,comment,"> It kind of makes sense that reading + writing + listening allows one to deeply absorb content and probably is the best combination so far. This is probably one of the best ideas I read on HN this year. It just seems to match up very well with my own study experiences, and the advice I have read recently in 'A Mind for Numbers' Thanks, I shall try this!",Another Comment,0.42,0.52,0.91,0.55,0.87,0.68,0.84,0.33,0.22,-0.82,True,True,True,True,True,False,"{'commentor': 'Luc', 'comment_time': 1451489418, 'comment_saltiness': -0.8181418180465698, 'is_salty': True, 'is_severe_toxic': True, 'is_obscene': True, 'is_identity_attack': True, 'is_insult': True, 'is_threat': False, 'parent_type': 'comment', 'parent_author': 'vilius', 'parent_title': 'Another Comment', 'cleaned_comment': ""> It kind of makes sense that reading + writing + listening allows one to deeply absorb content and probably is the best combination so far. This is probably one of the best ideas I read on HN this year. It just seems to match up very well with my own study experiences, and the advice I have read recently in 'A Mind for Numbers' Thanks, I shall try this!"", 'comment_id': 10812405, 'parent_id': 10812123}"
297642,bluecalm,1419870509,8810406,8809800,ebbv,1419862300.0,comment,It's really not easy to guess what you meant as you can see from number of people who responded to your comment with their guesses. Maybe you don't a function call or maybe like semi-colons or you don't like double quotes for a string...,Another Comment,-0.11,0.42,0.9,0.55,0.86,0.68,0.77,0.39,-0.05,-0.81,True,True,True,True,True,False,"{'commentor': 'bluecalm', 'comment_time': 1419870509, 'comment_saltiness': -0.8099615573883057, 'is_salty': True, 'is_severe_toxic': True, 'is_obscene': True, 'is_identity_attack': True, 'is_insult': True, 'is_threat': False, 'parent_type': 'comment', 'parent_author': 'ebbv', 'parent_title': 'Another Comment', 'cleaned_comment': ""It's really not easy to guess what you meant as you can see from number of people who responded to your comment with their guesses. Maybe you don't a function call or maybe like semi-colons or you don't like double quotes for a string..."", 'comment_id': 8810406, 'parent_id': 8809800}"
100029,tbrake,1387174783,6913199,6913115,vinhboy,1387172600.0,comment,"IMO we're past the point of the layperson being so technically hopeless they'd swallow this whole. I grant the numbers are still out there, but I think the people convinced/reassured/calmed by this kind of PR are that way because they genuinely think it's worth it. Maybe even a ""better the devil you know"" scenario?; it's worth ""our guys"" having their hands on all of this power, with its potential for abuse, than the ""other guys"". What I don't get is why, if you're the NSA, would you go through the PR effort for people already on ""your side""; you're not going to sway a large swath of the population and there surely can't be that many fence sitters on this issue.",Another Comment,0.24,0.46,0.93,0.55,0.92,0.25,0.84,0.39,0.11,-0.86,True,True,True,False,True,False,"{'commentor': 'tbrake', 'comment_time': 1387174783, 'comment_saltiness': -0.8597065210342407, 'is_salty': True, 'is_severe_toxic': True, 'is_obscene': True, 'is_identity_attack': False, 'is_insult': True, 'is_threat': False, 'parent_type': 'comment', 'parent_author': 'vinhboy', 'parent_title': 'Another Comment', 'cleaned_comment': 'IMO we\'re past the point of the layperson being so technically hopeless they\'d swallow this whole. I grant the numbers are still out there, but I think the people convinced/reassured/calmed by this kind of PR are that way because they genuinely think it\'s worth it. Maybe even a ""better the devil you know"" scenario?; it\'s worth ""our guys"" having their hands on all of this power, with its potential for abuse, than the ""other guys"". What I don\'t get is why, if you\'re the NSA, would you go through the PR effort for people already on ""your side""; you\'re not going to sway a large swath of the population and there surely can\'t be that many fence sitters on this issue.', 'comment_id': 6913199, 'parent_id': 6913115}"


In [None]:
%%time
# Should see spectrum from -1 to 1, and saltiness -1 to 0 (from diminishing effect of objectivity). 
df = df.sort_values(by = ['toxic', 'severe_toxicity'], ascending=False)
display(df.iloc[:,-6:].head(3))
display(df.iloc[:,-6:].tail(3))

### Normalize comment saltiness from sub `-1 to 1` to obj. Create booleans for +/- classes. 

In [None]:
df.head()
df = df.rename(columns={'author': 'parent_author', 
                        'cleaned_title': 'parent_title',
                        'score': 'parent_score', 
                        'story_time': 'parent_time', 
                        'ranking': 'comment_rank',
                        'commentid':'comment_id',
                        'parentid':'parent_id'})

In [None]:
df_salty = df[df['is_salty'] == True].copy()

In [None]:
df_happy =  df[df['is_salty'] == False].copy()
del(df)

In [None]:
df_happy["comment_JSON"] = np.nan

In [None]:
%%time
import json
def create_comment_JSON_records(df):
    """Attach a per-row record dict to `df` as the `comment_JSON` column.

    Selects the comment fields listed in `record_fields`, converts each row
    into a dict keyed by column name, and stores that dict in a new
    `comment_JSON` column. The dataframe is modified in place and nothing
    is returned.

    Args:
        df: Comment dataframe containing every field in `record_fields`.
    """
    record_fields = [
        'commentor', 'comment_time', 'comment_saltiness',
        "is_salty", "is_severe_toxic", "is_obscene",
        "is_identity_attack", "is_insult", "is_threat",
        'parent_type', 'parent_author', 'parent_title',
        'cleaned_comment', 'comment_id', 'parent_id',
    ]
    # One dict per row, in row order.
    row_records = df[record_fields].to_dict(orient='records')
    df['comment_JSON'] = row_records
    print( "JSON Uploaded")


create_comment_JSON_records(df_salty)

In [None]:
df_salty.head()

In [None]:
data = pd.concat([df_salty, df_happy])

In [None]:
del(df_salty)
del(df_happy)

In [None]:
data.shape

In [None]:
data.to_csv("data/predict_and_result/saving_work.csv", index = False)

### Calc some stats

In [None]:
def commentor_stats(df):
    """Returns stats about the commentor's comment history.

    Groups by `commentor` and aggregates `comment_time` with `count`,
    `max` and `min`.

    Columns Created:
        `count_comments` - count the number of comments.
        `time_of_last_comment`  - Unix Epoch time of the last comment before our
                                  data was pulled on Mar 16, 2019, 12:24:46 AM.
        `time_of_first_comment` - Unix Epoch time of the earliest comment.

    Args:
        df: The full comment dataframe.

    Returns:
        out: A dataframe with index `commentor` and created columns.
    """
    # Group with the default as_index=True: the result must be indexed by
    # `commentor` because the later merges into df_ct join on the index.
    out = df.groupby('commentor')['comment_time'].agg(['count', 'max', 'min'])
    out = out.rename(columns={'count': 'count_comments',
                              'max': 'time_of_last_comment',
                              'min': 'time_of_first_comment'})
    print("Calculated commentor stats.")
    return out


# # Run Function & create df_ct
# df_ct = commentor_stats(data)

In [None]:
df_ct = commentor_stats(data)
df_ct.head()

### Calculate Commentor `count comments` & `first/latest` comment dates. 

Also create Dataframe_Commentor_Table, `df_ct`.

In [None]:
def calculate_monthly_summaries(df):
    """Creates summary of stats over `commentors` history by month for graphing.

    Calculates the `count` and `sum` aggregated stats of `comment_saltiness`
    grouped by `commentor`, `month_text` & `is_salty`.
    Formats the stats into one dict for each commentor's month.
    Collects those dicts into a sparse list (no entries for months without
    comments), sorted by `y_m`, for each commentor.

    Stats in each `monthly_plot` entry are:
        y_m: Year-Month period (`YY_MM`) from the `month_text` group.
        c_h: Stat, count of Happy Comments for the month.
        c_s: Stat, count of Salty Comments for the month.
        t_h: Stat, total (sum) of Happy Comment Scores for the month.
        t_s: Stat, total (sum) of Salty Comment Scores for the month.

    Side effects:
        Adds (or overwrites) a `month_text` column on the given dataframe.

    Args:
        df: The full comment dataframe. Needs `commentor`, `comment_time`
            (parsed as Unix epoch seconds), `comment_saltiness` and
            `is_salty`.

    Returns:
        out: A dataframe with index `commentor`, and a column `monthly_plot`.
    """
    # The column is written onto the caller's frame, as before, so any code
    # that reads `month_text` afterwards keeps working.
    df['month_text'] = (pd.to_datetime(df['comment_time'], unit='s')
                        .dt.strftime('%Y_%m')).str[-5:]

    out = df['comment_saltiness'].groupby([df['commentor'],
                                           df['month_text'],
                                           df['is_salty']]
                                         ).agg(['count', 'sum']).unstack()

    # Flatten the (stat, is_salty) column pairs by looking the pair up
    # directly instead of relying on the string repr of the tuple.
    short_names = {('count', False): 'c_h',  # Count Happy
                   ('count', True): 'c_s',   # Count Salty
                   ('sum', False): 't_h',    # Sum Happy
                   ('sum', True): 't_s'}     # Sum Salty
    out.columns = [short_names[(stat, salty)] for stat, salty in out.columns]
    # If the data holds only salty or only happy comments, unstack() produces
    # just one class; make sure all four stats exist (filled with 0 below).
    out = out.reindex(columns=['c_h', 'c_s', 't_h', 't_s'])
    print("Calculated monthly stats")

    # Combine the monthly stats into one dict per (commentor, month) row.
    out = out.reset_index().rename(columns={'month_text': 'y_m'})
    out = out.fillna(0.0)
    out['t_h'] = out['t_h'].round(decimals=2)
    out['t_s'] = out['t_s'].round(decimals=2)
    # Keep the sorted frame (sort_values is not in-place).
    out = out.sort_values(['commentor', 'y_m'], ascending=[True, True])
    records = out[['y_m', 't_s', 't_h', 'c_s', 'c_h']].to_dict(orient='records')

    # Collect the monthly dicts into a list for each commentor. Grouping by
    # key does not depend on rows of one commentor being contiguous.
    plots = {}
    for who, record in zip(out['commentor'], records):
        plots.setdefault(who, []).append(record)
    result = pd.DataFrame(data={'monthly_plot': list(plots.values())},
                          index=list(plots.keys()))
    print("Created monthly stat lists.")
    return result


# Run Function & Merge into df_ct
df_ct = pd.merge(df_ct, calculate_monthly_summaries(data),
                 left_index=True, right_index=True, how='left')
df_ct.head()

### Create the `top_cmnts_s` list (top 50 comments) for each Commentor - Filter by `is_salty`

In [None]:
def top_salty_comments(df, n=50):
    """Creates list object for each `commentor` of their saltiest comments.

    Filters by `is_salty` = True.
    Sorts values by `comment_saltiness` from the most salty (lowest value).
    Groups dataframe by `commentor`.
    Collects up to `n` `comment_JSON` comment objects into a list per
    commentor, saltiest first.

    Args:
        df: The full comment dataframe.
        n: Maximum number of comments kept per commentor (default 50).

    Returns:
        df: A dataframe w/ index `commentor` and a column 'top_cmnts_s'.
            Commentors without salty comments are absent; the frame is
            empty when there are no salty comments at all.
    """
    # Grab the right comments, pulls up to `n` comments by saltiest.
    salty = df[df['is_salty'] == True]
    salty = salty.sort_values(['commentor', 'comment_saltiness'],
                              ascending=[True, True])
    top = salty.groupby('commentor').head(n)

    # Group the comments into a list for each user. Building the lists by
    # key also copes with an empty selection.
    per_commentor = {}
    for who, comment in zip(top['commentor'], top['comment_JSON']):
        per_commentor.setdefault(who, []).append(comment)
    out = pd.DataFrame(data={'top_cmnts_s': list(per_commentor.values())},
                       index=list(per_commentor.keys()))
    print("Grabbed the SALTIEST comments.")
    return out


# Run Function & Merge into df_ct
df_ct = pd.merge(df_ct, top_salty_comments(data),
                 left_index=True, right_index=True, how='left')

### Get the `top_salty_comment` for each Commentor - Need it for `Rank` Lists

In [None]:
def the_top_salty_comment(df):
    """Returns the top salty comment of each `commentor`.

    Filters by `is_salty`.
    Sorts on `commentor` and `comment_saltiness` to bring saltiest to top.
    Groups dataframe by `commentor`.
    Keeps the first (saltiest) `comment_JSON` of each `commentor`, wrapped
    in a one-element list.

    Args:
        df: The full comment dataframe.

    Returns:
        df: A dataframe w/ index `commentor` and column `top_salty_comment`.
            Commentors without salty comments are absent; the frame is
            empty when there are no salty comments at all.
    """
    # Grab the right comments, will pull the top salty comment.
    salty = df[df['is_salty'] == True]
    salty = salty.sort_values(['commentor', 'comment_saltiness'],
                              ascending=[True, True])
    top = salty.groupby('commentor').head(1)

    # Group the comments into a list for each user. Building the lists by
    # key also copes with an empty selection.
    per_commentor = {}
    for who, comment in zip(top['commentor'], top['comment_JSON']):
        per_commentor.setdefault(who, []).append(comment)
    out = pd.DataFrame(data={'top_salty_comment': list(per_commentor.values())},
                       index=list(per_commentor.keys()))
    print("Grabbed the top SALTIEST comment.")
    return out


# Run Function & Merge into df_ct
df_ct = pd.merge(df_ct, the_top_salty_comment(data),
                 left_index=True, right_index=True, how='left')

### Calculate stats for Saltiness - `Overall`

In [None]:
def saltiness_stats(df):
    """Creates stats of `comment_saltiness` overall.

    Groups dataframe by `commentor` and aggregates `comment_saltiness`
    with `sum`, `mean`, `min` & `max` over all of a commentor's comments
    (salty and happy together).
    Creates a new column for each aggregate stat: 4 new columns
    (`sum_slt_oall`, `average_slt_oall`, `min_slt_oall`, `max_slt_oall`).

    Args:
        df: The full comment dataframe.

    Returns:
        out: A dataframe with index `commentor`, and a column for each agg.stat.
    """
    # Group with the default as_index=True: the result is merged into df_ct
    # on the index, so it must be indexed by `commentor`.
    out = (df.groupby('commentor')['comment_saltiness']
           .agg(['sum', 'mean', 'min', 'max']))
    out = out.rename(columns={'sum': 'sum_slt_oall',
                              'mean': 'average_slt_oall',
                              'min': 'min_slt_oall',
                              'max': 'max_slt_oall'})
    print("Calculated saltiness overall stats.")
    return out


# Run Function & Merge into df_ct
df_ct = pd.merge(df_ct, saltiness_stats(data), left_index=True,
                 right_index=True, how='left')

### Calculate stats for Saltiness - Split `Happy/Salty`

In [None]:
def saltiness_stats_split(df):
    """Creates stats of `comment_saltiness` split by `is_salty`.

    Groups the dataframe by `commentor` and `is_salty`, then aggregates the
    `count`, `sum` & `mean` of `comment_saltiness`.
    Creates a new column for each aggregate stat: 6 new columns
    (`cnt_slt_h`, `cnt_slt_s`, `sum_slt_h`, `sum_slt_s`, `avg_slt_h`,
    `avg_slt_s`; `_h` = happy / not salty, `_s` = salty).

    All six columns are always present. A commentor with no comments in one
    of the two classes gets NaN in that class's columns.

    Args:
        df: The full comments dataframe.

    Returns:
        out: A dataframe with index `commentor`, and a column for each agg.stat.
    """
    stats = ['count', 'sum', 'mean']
    prefixes = {'count': 'cnt_slt_', 'sum': 'sum_slt_', 'mean': 'avg_slt_'}

    out = (df['comment_saltiness']
           .groupby([df['commentor'], df['is_salty']])
           .agg(stats).unstack())

    # Name columns from the (stat, is_salty) pairs themselves rather than from
    # the string repr of the tuple, which silently fails to match if the repr
    # of the flag differs. The flag is interpreted by truthiness.
    out.columns = [prefixes[stat] + ('s' if salty else 'h')
                   for stat, salty in out.columns]

    # Guarantee the full, fixed column set even when one class never occurs;
    # downstream code indexes these columns directly.
    expected = [prefixes[stat] + suffix
                for stat in stats for suffix in ('h', 's')]
    out = out.reindex(columns=expected)

    print("Calculated saltiness grouped stats - split by salty/happy.")
    return out


# Attach the happy/salty split stats to the running summary (left join on index).
df_ct = df_ct.merge(saltiness_stats_split(data), how='left',
                    left_index=True, right_index=True)

In [None]:
# Preview the first rows of the summary after the merges above.
df_ct.head()

In [None]:
# df_ct is keyed by its index (every merge above joins on it), so the index
# has to be written out; with index=False the saved rows could not be tied
# back to the entity they describe.
df_ct.to_csv("data/predict_and_result/df_ct_saving.csv", index=True)

In [None]:
# Continue on a copy so columns added to hn_cs do not alter df_ct.
hn_cs = df_ct.copy()

### Create Ranking Columns for AMT of Salt Contributed Rank, Qty of Salty Comments Rank, Overall_Saltiest_Rank, & Saltiest_Trolls_Rank

In [None]:
def rank_sum_lifetime_amount(df):
    """Ranks commentors by the total salt they have contributed.

    Keeps only rows whose salty-comment score sum `sum_slt_s` is below zero,
    orders them from lowest to highest (more negative = more salty) and
    numbers them 1, 2, 3, ... in that order (ties broken by position).

    Args:
        df: The commentor_summary dataframe.

    Returns:
        A Series named `rank_lt_amt_slt`, indexed like the qualifying rows
        of `df` and ordered from saltiest to least salty.
    """
    net_salty = df.loc[df['sum_slt_s'] < 0]
    ordered = net_salty.sort_values(by=['sum_slt_s'])
    ranks = ordered['sum_slt_s'].rank(axis=0, method='first')
    ranks = ranks.rename("rank_lt_amt_slt")
    print("Created rank_sum_lifetime_amount.")
    return ranks


# Left join on index; commentors without a rank get NaN in the new column.
hn_cs = hn_cs.merge(rank_sum_lifetime_amount(hn_cs), how='left',
                    left_index=True, right_index=True)


def rank_sum_lifetime_qty(df):
    """Ranks commentors by how many salty comments they have written.

    Orders all rows by the salty-comment count `cnt_slt_s` from highest to
    lowest and numbers them in that order (ties broken by position). Rows
    with a missing count keep a missing rank.

    Args:
        df: The commentor_summary dataframe.

    Returns:
        A Series named `rank_lt_qty_sc`, indexed like `df` and ordered from
        the highest count to the lowest.
    """
    ordered = df.sort_values(by='cnt_slt_s', ascending=False)
    ranks = ordered['cnt_slt_s'].rank(axis=0, method='first', ascending=False)
    ranks = ranks.rename("rank_lt_qty_sc")
    print("Created rank_sum_lifetime_qty.")
    return ranks


# Left join on index to add the quantity rank next to the existing columns.
hn_cs = hn_cs.merge(rank_sum_lifetime_qty(hn_cs), how='left',
                    left_index=True, right_index=True)


def rank_overall_saltiest(df, min_comments=20):
    """Rank commentors on the overall sum of their lifetime happy & salty scores.

    Filters commentors to ensure each:
        Has some happy and some salty comments.
        Has more than `min_comments` total comments.
        Has overall Saltiness < 0.
    Sorts by the overall saltiness score `sum_slt_oall`, i.e. sum of happy+salty
    scores across all comments, from lowest to highest.
    Assigns a rank based on position after sorting.

    Indicates: a tendency towards a majority of comments being salty.

    Args:
        df: The commentor_summary dataframe.
        min_comments: Exclusive lower bound on `count_comments`; only
            commentors with strictly more comments than this are ranked.
            Defaults to 20, the threshold this function has always applied.

    Returns:
        out: A Series named `rank_oall_slt`, indexed like the qualifying rows
            of `df` and ordered from saltiest to least salty.
    """
    # Comparisons against NaN are False, so commentors lacking either happy
    # or salty comments (NaN counts) are excluded here.
    eligible = df[(df['sum_slt_oall'] < 0) & (df['cnt_slt_s'] > 0) &
                  (df['cnt_slt_h'] > 0) &
                  (df['count_comments'] > min_comments)]
    out = (eligible.sort_values(by=['sum_slt_oall'])['sum_slt_oall']
           .rank(axis=0, method='first')
           .rename("rank_oall_slt"))
    print("Created rank_overall_saltiest.")
    return out


# Left join on index; commentors filtered out of the ranking get NaN.
hn_cs = hn_cs.merge(rank_overall_saltiest(hn_cs), how='left',
                    left_index=True, right_index=True)


def rank_saltiest_trolls(df):
    """Ranks commentors who have no happy comments by overall saltiness.

    A commentor qualifies when:
        `cnt_slt_h` is missing (no happy comments were counted), and
        the overall saltiness `sum_slt_oall` is below zero.
    Qualifying rows are ordered by `sum_slt_oall` from lowest to highest and
    numbered in that order (ties broken by position).

    Reasoning:
        A complete absence of positive comments is rare and typically
        indicates a purpose-made "trolling" account.

    Args:
        df: The commentor_summary dataframe.

    Returns:
        A Series named `rank_slt_trolls`, indexed like the qualifying rows
        of `df` and ordered from saltiest to least salty.
    """
    no_happy = df["cnt_slt_h"].isnull()
    net_salty = df['sum_slt_oall'] < 0
    trolls = df.loc[no_happy & net_salty].sort_values(by=['sum_slt_oall'])
    ranks = trolls['sum_slt_oall'].rank(axis=0, method='first')
    ranks = ranks.rename("rank_slt_trolls")
    print("Created rank_saltiest_trolls.")
    return ranks


# Left join on index; only troll accounts receive a rank, everyone else NaN.
hn_cs = hn_cs.merge(rank_saltiest_trolls(hn_cs), how='left',
                    left_index=True, right_index=True)


# Move the index into an ordinary column for the exports that follow.
hn_cs = hn_cs.reset_index()

###  Create Top100 Lists for AMT of Salt Contributed Rank, Qty of Salty Comments Rank, Overall_Saltiest_Rank, & Saltiest_Trolls_Rank & SAVE AS JSON

In [None]:
def top100_amt_salt(df):
    """Exports the first 100 commentors by `rank_lt_amt_slt` as JSON.

    Drops rows without a `rank_lt_amt_slt`, orders the rest by that rank and
    keeps the first 100. Writes the columns `commentor`, `rank_lt_amt_slt`,
    `sum_slt_s` and `top_salty_comment` as a list of records to
    `Final_Data2/top100_AMT_Salt_Contributed.json`.

    Args:
        df: The commentor_summary dataframe w/ ranks.
    """
    has_rank = df["rank_lt_amt_slt"].notnull()
    leaders = df.loc[has_rank].sort_values(by=["rank_lt_amt_slt"]).iloc[:100]
    export_cols = ["commentor", "rank_lt_amt_slt",
                   "sum_slt_s", "top_salty_comment"]
    leaders[export_cols].to_json(
        'Final_Data2/top100_AMT_Salt_Contributed.json', orient='records')
    print("Saved top100_AMT_Salt_Contributed.json")

# Write the salt-amount leaderboard from the ranked commentor summary.
top100_amt_salt(hn_cs)


def top100_qty_salty_comments(df):
    """Saves a .JSON of the Top 100 Commentors by `rank_lt_qty_sc`

    Sorts by `rank_lt_qty_sc`
    Makes a dataframe of rows [0:100] by `rank_lt_qty_sc`
    Saves dataframe as `top100_QTY_Salty_Comments.json`

    Args:
        df: The commentor_summary dataframe w/ ranks.
    """
    top100 = (df[df["rank_lt_qty_sc"].notnull()]
              .sort_values(by=["rank_lt_qty_sc"]).head(100))
    top100 = top100[["commentor", "rank_lt_qty_sc",
                     "cnt_slt_s", "top_salty_comment"]]
    top100.to_json('Final_Data2/top100_QTY_Salty_Comments.json',
                   orient='records')
    # Report the file that was actually written (the message used to name the
    # AMT export by mistake).
    print("Saved top100_QTY_Salty_Comments.json")


# Write the salty-comment-count leaderboard from the ranked commentor summary.
top100_qty_salty_comments(hn_cs)


def top100_overall_saltiest(df):
    """Exports the first 100 commentors by `rank_oall_slt` as JSON.

    Drops rows without a `rank_oall_slt`, orders the rest by that rank and
    keeps the first 100. Writes the columns `commentor`, `rank_oall_slt`,
    `sum_slt_oall` and `top_salty_comment` as a list of records to
    `Final_Data2/top100_Overall_Saltiest.json`.

    Args:
        df: The commentor_summary dataframe w/ ranks.
    """
    has_rank = df["rank_oall_slt"].notnull()
    leaders = df.loc[has_rank].sort_values(by=["rank_oall_slt"]).iloc[:100]
    export_cols = ["commentor", "rank_oall_slt",
                   "sum_slt_oall", "top_salty_comment"]
    leaders[export_cols].to_json('Final_Data2/top100_Overall_Saltiest.json',
                                 orient='records')
    print("Saved top100_Overall_Saltiest.json")
    

# Write the overall-saltiest leaderboard from the ranked commentor summary.
top100_overall_saltiest(hn_cs)


def top100_saltiest_trolls(df):
    """Exports the first 100 trolls by `rank_slt_trolls` as JSON.

    Drops rows without a `rank_slt_trolls`, orders the rest by that rank and
    keeps the first 100. Writes the columns `commentor`, `rank_slt_trolls`,
    `sum_slt_oall` and `top_salty_comment` as a list of records to
    `Final_Data2/top100_Saltiest_Trolls.json`.

    Args:
        df: The commentor_summary dataframe w/ ranks.
    """
    has_rank = df["rank_slt_trolls"].notnull()
    leaders = df.loc[has_rank].sort_values(by=["rank_slt_trolls"]).iloc[:100]
    export_cols = ["commentor", "rank_slt_trolls",
                   "sum_slt_oall", "top_salty_comment"]
    leaders[export_cols].to_json('Final_Data2/top100_Saltiest_Trolls.json',
                                 orient='records')
    print("Saved top100_Saltiest_Trolls.json")


# Write the troll leaderboard from the ranked commentor summary.
top100_saltiest_trolls(hn_cs)

### Prepare and Save  `hn_cs` as `.csv` for upload to PostgreSQL. 

In [None]:
# index=False: the index is not written to the CSV, only the columns are.
hn_cs.to_csv('Final_Data2/hn_commentor_summary.csv',index=False)

In [None]:
# Also keep a pickle of the summary (preserves dtypes, unlike the CSV).
hn_cs.to_pickle('data/hn_cs.pkl')
print('Dataframe Saved')

## Create HackerNews Overall "Scorecard" Stats

In [None]:
# NOTE(review): the timestamps are read from `df` while the results are stored
# on `data`; pandas aligns the assignment on index labels, so this presumably
# relies on both frames sharing the same index — confirm, or read from `data`.
data["year"] = pd.to_datetime(df['comment_time'],unit='s').dt.strftime('%Y')
# '%Y_%m' gives e.g. '2019_05'; the [-5:] slice keeps the last five characters ('19_05').
data["month"] = (pd.to_datetime(df['comment_time'],unit='s').dt.strftime('%Y_%m')).str[-5:]
# Constant label so every comment falls into one single "all_time" period.
data["all_time"] = "all_time"

In [None]:
def _hn_period_stats(df, period_col):
    """Aggregates `comment_saltiness` per value of `period_col` (index `period`)."""
    period = df[period_col].rename('period')

    # Stats over every comment in the period.
    overall = (df['comment_saltiness'].groupby(period)
               .agg(['count', 'sum', 'mean'])
               .rename(columns={'sum': 'hn_sum_slt_oall',
                                'mean': 'hn_avg_oall',
                                'count': 'hn_count_oall'}))

    # Stats over the salty comments only.
    salty_mask = df['is_salty'] == True
    salty = (df.loc[salty_mask, 'comment_saltiness']
             .groupby(period[salty_mask])
             .agg(['count', 'sum'])
             .rename(columns={'count': 'hn_cnt_slt_s',
                              'sum': 'hn_sum_slt_s'}))

    # Left join keeps periods that have no salty comments (NaN salty stats).
    return pd.merge(overall, salty, left_index=True, right_index=True,
                    how='left')


def hn_overall_stats(df):
    """Builds the site-wide saltiness scorecard for every period.

    Computes the same statistics three times, grouping the comments by the
    `all_time`, `year` and `month` columns in turn, and stacks the results.
    The input dataframe is not modified.

    Args:
        df: The full comments dataframe; must contain `comment_saltiness`,
            `is_salty`, `all_time`, `year` and `month`.

    Returns:
        A dataframe indexed by period label (all-time row first, then years,
        then months) with columns `hn_count_oall`, `hn_sum_slt_oall`,
        `hn_avg_oall`, `hn_cnt_slt_s` and `hn_sum_slt_s`.
    """
    period_columns = ['all_time', 'year', 'month']
    return pd.concat([_hn_period_stats(df, col) for col in period_columns])

# Build the all-time / per-year / per-month scorecard and preview its first rows.
hn_stats_summary = hn_overall_stats(data)
display(hn_stats_summary.head(4))

## Get a Summary of User Stats by Period (`all_time`, `year`, `month`) for finding the Saltiest Commentor in each period.

In [None]:
# CREATE OUR SUMMARY OF USER STATS BY PERIOD
def css_get(df, period_text):
    """Summarises saltiness per commentor for each value of one period column.

    Works on the comments dataframe (not the commentor summary) and leaves it
    unmodified.

    Steps:
        Label every comment with its period (the `period_text` column).
        Sort by commentor, then by `comment_saltiness` ascending.
        Calculate `sum_slt_oall` for each commentor/period.
        Take the first (lowest `comment_saltiness`) comment of each
        commentor/period as `top_salty_comment`.
        Filter by `is_salty` and calculate `cnt_slt_s` & `sum_slt_s` for each
        commentor/period.

    Args:
        df: The full comments dataframe.
        period_text: Name of the column holding the period label
            (e.g. "all_time", "year" or "month").

    Returns:
        A dataframe indexed by (`commentor`, `period`), limited to pairs with
        at least one salty comment, with columns `cnt_slt_s`, `sum_slt_s`,
        `sum_slt_oall`, `top_salty_comment` and `comment_saltiness` (the score
        of that top comment).
    """
    # assign() returns a new frame, so the caller's dataframe does not gain a
    # `period` column as a side effect.
    df = df.assign(period=df[period_text])
    df = df.sort_values(['commentor','comment_saltiness'], ascending=[True, True])

    keys = [df['commentor'], df['period']]
    df_a = (df['comment_saltiness'].groupby(keys).agg(['sum'])
            .rename(columns={'sum': 'sum_slt_oall'}))

    # Rows are already ordered by saltiness within each commentor, so head(1)
    # picks the lowest-scoring comment of every commentor/period group.
    df_b = (df[['period','commentor','comment_JSON','comment_saltiness']]
            .groupby(keys).head(1)
            .set_index(['commentor', 'period']))

    salty = df[df['is_salty'] == True]
    df_c = (salty['comment_saltiness']
            .groupby([salty['commentor'], salty['period']])
            .agg(['count','sum'])
            .rename(columns={'count': 'cnt_slt_s', 'sum': 'sum_slt_s'}))

    out = df_c.join([df_a, df_b], how = 'left')
    return out.rename(columns = {'comment_JSON': 'top_salty_comment'})

# Stack the per-commentor summaries for every period granularity, then flatten
# the (commentor, period) index into columns, ordered by period.
css_data = pd.concat([css_get(data, period_col)
                      for period_col in ("all_time", "year", "month")])
css_table = css_data.sort_values(["period"]).reset_index()
css_table.head(4)

## Select the top Saltiest by each of our rank methods for `all_time`, `year`, and by `month`. Merge them, then merge with `hn_stats_summary`. Save as json.

In [None]:
def _top_row_per_period(df, sort_cols, ascending, value_col, prefix):
    """Returns the first row of every period after sorting, with prefixed columns.

    Args:
        df: The per-commentor/period summary table (one row per pair).
        sort_cols: Columns to sort by; must start with `period`.
        ascending: Sort direction for each entry of `sort_cols`.
        value_col: The metric column to keep next to `commentor` and
            `top_salty_comment`.
        prefix: Text prepended to every output column name.

    Returns:
        A dataframe indexed by `period` with columns `<prefix>commentor`,
        `<prefix><value_col>` and `<prefix>top_salty_comment`.
    """
    # sort_values returns a new frame, so the caller's table is left untouched.
    ordered = df.sort_values(sort_cols, ascending=ascending)
    winners = (ordered[['period', 'commentor', value_col, 'top_salty_comment']]
               .groupby(ordered['period']).head(1)
               .set_index('period'))
    return winners.add_prefix(prefix)


# By Count of Salty Comments CSC
def hn_agg_a(df):
    """Per period: the commentor with the most salty comments (`csc_` columns).

    Ties on `cnt_slt_s` go to the lower (saltier) `sum_slt_s`.
    """
    return _top_row_per_period(df, ['period', 'cnt_slt_s', 'sum_slt_s'],
                               [True, False, True], 'cnt_slt_s', 'csc_')


# By Sum of Salty Comments SSC
def hn_agg_b(df):
    """Per period: the commentor with the lowest `sum_slt_s` (`ssc_` columns).

    Ties on `sum_slt_s` go to the higher `cnt_slt_s`.
    """
    return _top_row_per_period(df, ['period', 'sum_slt_s', 'cnt_slt_s'],
                               [True, True, False], 'sum_slt_s', 'ssc_')


# By Sum of Overall Salt (Positive + Negative) - SOS
def hn_agg_c(df):
    """Per period: the commentor with the lowest `sum_slt_oall` (`sos_` columns).

    Ties on `sum_slt_oall` go to the higher `cnt_slt_s`.
    """
    return _top_row_per_period(df, ['period', 'sum_slt_oall', 'cnt_slt_s'],
                               [True, True, False], 'sum_slt_oall', 'sos_')


# By Saltiest Comment for the Period - SCP
def hn_agg_d(df):
    """Per period: the commentor with the lowest `comment_saltiness` (`scp_` columns).

    Ties go to the lower `sum_slt_s`, then to the higher `cnt_slt_s`.
    """
    return _top_row_per_period(
        df, ['period', 'comment_saltiness', 'sum_slt_s', 'cnt_slt_s'],
        [True, True, True, False], 'comment_saltiness', 'scp_')

hn_agg_csc = hn_agg_a(css_table)
hn_agg_ssc = hn_agg_b(css_table)
hn_agg_sos = hn_agg_c(css_table)
hn_agg_scp = hn_agg_d(css_table)

# Line the four per-period winner tables up side by side on their shared
# `period` index, then attach them to the site-wide scorecard.
hn_agg = pd.concat([hn_agg_csc, hn_agg_ssc, hn_agg_sos, hn_agg_scp], axis = 1)
hn_stats_summary_w_agg = pd.concat([hn_stats_summary, hn_agg], axis = 1)

display(hn_stats_summary_w_agg.shape)
display(hn_stats_summary_w_agg.head(4))
display(hn_stats_summary_w_agg.columns)

# NOTE(review): orient='records' does not write the index, and the period
# label lives only in the index here — confirm consumers of this file do not
# need it (otherwise reset_index() before exporting).
hn_stats_summary_w_agg.to_json('Final_Data2/hn_stats_summary_w_agg_v2.json',
                               orient='records')
# print() returns None, so wrapping it in display() only rendered a stray
# "None"; the message now also names the file that is actually written.
print("saved hn_stats_summary_w_agg_v2.json")

## Save the comment dataframe with all of the custom fields as a CSV.

In [None]:
# Writes every column currently on `data`; the index is not written (index=False).
data.to_csv('Final_Data2/hn_comments_full_db_w_custom_fields_v2.csv',index=False)