### Import Libraries

In [2]:
# Import libraries
from ast import arg
from operator import neg
import pandas as pd
import posixpath,ntpath,json,platform,argparse
from datetime import datetime, timedelta

### Time to float 

In [3]:


def time_to_float(time):
    """
        Convert an "HH:MM:SS" duration into a total number of seconds.
        time(str): value to convert; it is passed through str() first, so
        non-string inputs are accepted.

        Returns 0 when the text is "nan", the total seconds as an int when the
        text has exactly three colon-separated integer fields, and None (after
        printing the error) for anything else.
    """
    time = str(time)
    if time != "nan":
        try:
            # Unpack before converting so a wrong field count is reported first.
            raw_hours, raw_minutes, raw_seconds = tuple(time.split(":"))
            hours = int(raw_hours)
            minutes = int(raw_minutes)
            seconds = int(raw_seconds)
        except Exception as e:
            print(f"Error {e} occured for {time}")
            return None
        return 3600 * hours + 60 * minutes + seconds
    return 0


### Determine Customer Reaction

In [4]:

def determine_customer_reaction(all_data):
    """
        Pick the dominant reaction group for one row of data.
        all_data(row): a mapping/pandas row holding a count for every column
        listed in the module-level `customer_reaction_` mapping, which has the
        keys "positive", "negative" and "neutral".

        The counts of each group's columns are summed and the group with the
        largest total wins; on a tie the earliest group in the order
        positive, negative, neutral is kept.
        Returns a one-element pandas Series indexed by "customer_reaction".
    """
    totals = {}
    for sentiment in ("positive", "negative", "neutral"):
        totals[sentiment] = sum(
            int(all_data[column]) for column in customer_reaction_[sentiment]
        )

    # max() returns the first key that reaches the maximum, which keeps the
    # positive > negative > neutral tie-break order.
    dominant = max(totals, key=totals.get)
    return pd.Series([dominant], index=["customer_reaction"])


### Determine Content Type

In [5]:

def determine_content_type(all_data):
    """
        Pick the dominant content type for one row of data.
        all_data(row): a mapping/pandas row holding a count for every column
        listed in the module-level `content_type_` mapping, which has the keys
        "general_knowledge", "science_nd_religion", "politics" and
        "peace_nd_violence".

        The counts of each type's columns are summed and the type with the
        largest total wins; on a tie the earliest type in the order listed
        above is kept. When the winning total is 0 the row is labelled
        "no_label".
        Returns a one-element pandas Series indexed by "content_type".
    """
    category_order = (
        "general_knowledge",
        "science_nd_religion",
        "politics",
        "peace_nd_violence",
    )
    totals = {
        category: sum(int(all_data[column]) for column in content_type_[category])
        for category in category_order
    }

    # Work on plain Python ints: calling int() on a one-element Series (what the
    # MultiIndex lookup used to return) is deprecated in recent pandas.
    best_category = max(totals, key=totals.get)
    label = best_category if totals[best_category] != 0 else "no_label"
    return pd.Series([label], index=["content_type"])


### Convert Date to Standard

In [6]:

def convert_date_standard(input_date,format='%Y-%m-%d %H:%M:%S EDT'):
    """
    This function parses a timestamp string into a datetime object.
    input_date(str): string value to convert to standard format.
    format(str): strptime format tried first on input_date.

    If `format` does not match, two fallbacks are tried in order:
    '%Y-%m-%d%H:%M:%S EDT' (no space between date and time) and
    '%Y-%m-%d %H:%M:%S EST'. The zone suffix is matched as literal text only,
    so the returned datetime is naive.

    Returns the parsed datetime, or None (after printing the last error) when
    no format matches or input_date is not a string.
    """
    candidate_formats = (
        format,
        '%Y-%m-%d%H:%M:%S EDT',
        '%Y-%m-%d %H:%M:%S EST',
    )
    last_error = None
    for candidate in candidate_formats:
        try:
            return datetime.strptime(input_date, candidate)
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed while parsing a large column.
        except Exception as e:
            last_error = e
    print(f"Error occured: {last_error} for {input_date}")
    return None



### Check Output File format 

In [7]:

# make sure the output file name ends in .csv
def format_outfile_name(filename):
    """
    This function replaces whatever extension the file name has with ".csv".
    filename(str): output file name or path; "/" and "\\" are both treated
    as directory separators.

    Only the last path component is cut at its first dot, so dots in directory
    names ("./out/data.txt", "runs/v1.2/data.txt") no longer truncate the path.
    Raises AssertionError when filename is not a string.
    """
    assert isinstance(filename, str), "format of filename should be string"
    last_sep = max(filename.rfind(posixpath.sep), filename.rfind(ntpath.sep))
    directory, base_name = filename[:last_sep + 1], filename[last_sep + 1:]
    # A dot-file such as ".env" has an empty stem; keep its full name instead.
    stem = base_name.split(".")[0] or base_name
    return directory + stem + ".csv"


### Convert path from win to linux path

In [8]:

# function to handle user data path
def convert_path_for_win_linux(path_):
    """
    This function rewrites the directory separators of a path for the
    operating system the code is running on.
    path_(str): path that may contain "/" and/or "\\" separators.

    On Windows every "/" becomes "\\"; on every other system every "\\"
    becomes "/". The decision is based on platform.system(): the CPU
    architecture reported by platform.machine() says nothing about the path
    convention (x86_64 Linux and Intel macOS are not "arm64").
    """
    if platform.system() == "Windows":
        return path_.replace(posixpath.sep, ntpath.sep)
    return path_.replace(ntpath.sep, posixpath.sep)


### Open json files

In [9]:

# function to open a json file
def open_json_files(file_name):
    """
    This function loads a JSON file and returns the decoded content.
    file_name(str): path of the file; it must end in ".json" (any letter case).

    The file is read as UTF-8 so the result does not depend on the platform's
    default encoding.
    Raises AssertionError when the name does not end in ".json"; errors from
    open() and json.load() propagate unchanged.
    """
    # Check the suffix rather than a substring, so "x.json.bak" is rejected.
    assert str(file_name).lower().endswith(".json"), "File needs to be of type json"
    with open(file_name, encoding="utf-8") as json_file:
        return json.load(json_file)


### Check if specific labels are present

In [10]:

# Count signal words per category for one row
def label_present(all_data):
    """
        This function checks which signal words appear in a row's message and
        tallies them per category.
        all_data(row): a mapping/pandas row with at least a "Message" entry; the
        message is passed through str() before matching.

        Categories are the keys of the module-level `label_template`, and the
        words for each category come from the module-level `labeller_`. A word
        adds 1 to its category when it occurs anywhere in the message
        (case-sensitive substring match); repeated occurrences of the same word
        still add only 1. Counts start from the values stored in
        `label_template`, which is left unmodified.
        Returns a pandas Series of counts indexed by the sorted category names.
    """
    message_text = str(all_data["Message"])
    categories = sorted(label_template.keys())

    counts = [
        label_template[category]
        + sum(1 for signal in labeller_[category] if signal in message_text)
        for category in categories
    ]
    return pd.Series(counts, index=categories)


### Run labellers and Processors

In [13]:

print("Program Started Running ...")
# Lookup tables used by the row-level functions defined above.
# Signal words per category (read by label_present).
labeller_ = open_json_files("process_json/signal_categories.json")

# Reaction columns per sentiment group and label columns per content type.
customer_reaction_ = open_json_files("process_json/customer_reaction.json")
content_type_ = open_json_files("process_json/content_type.json")

# Starting counts per category (read by label_present).
label_template = open_json_files("process_json/template.json")

important_columns = ['Post Created', 'Video Share Status', 'Total Views', 
                    'Total Views For All Crossposts', 'Video Length', 
                    'Message', 'Link Text', 'Likes at Posting','Likes',
                    'Comments','Shares','Love','Wow','Haha','Sad', 'Angry', 
                    'Care']
file_name = convert_path_for_win_linux("data/raw/vice_data_for_test_task.csv")
outputname = format_outfile_name("data/output/vice_data_with_signal_category.csv")

# Keep only the columns of interest, then normalise the two text columns.
# Series.map replaces the deprecated DataFrame.applymap on a one-column frame.
vice_data = pd.read_csv(file_name)[important_columns]
vice_data = vice_data.assign(**{
    'Post Created': vice_data['Post Created'].map(convert_date_standard),
    'Video Length': vice_data['Video Length'].map(time_to_float),
})

# Each step appends its output columns to the frame. The order matters:
# determine_content_type reads the columns produced by label_present.
for row_function in (label_present, determine_customer_reaction, determine_content_type):
    new_columns = vice_data.apply(row_function, axis=1)
    vice_data = pd.concat([vice_data, new_columns], axis=1)

vice_data.to_csv(outputname,index=False)
print(f"File output saved in {outputname}.\nProgram Stopped Running.")

Program Started Running ...
File output saved in data/output/vice_data_with_signal_category.csv.
Program Stopped Running.


### <a href="https://datastudio.google.com/reporting/5f526187-6b3c-404c-87bc-97be2b3c7827/page/0bQoC/edit">Visualisation - Click to view Additional dashboard link </a>

In [17]:
# ! pip install seaborn

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Load the processed CSV (the same path the processing cell above writes to) for the visualisations.
new_data = pd.read_csv("data/output/vice_data_with_signal_category.csv")

![All_Time_Post_Content_Analysis](data/output/visualisations/All_Time_Post_Content_Analysis.png)

![All_Time_Post_Content_Category_Analysis](data/output/visualisations/All_Time_Post_Content_Category_Analysis.png)

![All_Time_Source_Analysis](data/output/visualisations/All_Time_Source_Analysis.png)

![All_Time_Total_Post_Analysis](data/output/visualisations/All_Time_Total_Post_Analysis.png)

![All_Time_User_Reaction_Analysis](data/output/visualisations/All_Time_User_Reaction_Analysis.png)

![All_Time_User_Reaction_Category_Analysis](data/output/visualisations/All_Time_User_Reaction_Category_Analysis.png)

![All_Time_Video_Length_Analysis](data/output/visualisations/All_Time_Video_Length_Analysis.png)

![Content_Category_Overview](data/output/visualisations/Content_Category_Overview.png)

![Content_Overview_By_User_Reaction](data/output/visualisations/Content_Overview_By_User_Reaction.png)

![Post_Source](data/output/visualisations/Post_Source.png)

![User_Reaction_By_Total_Posts](data/output/visualisations/User_Reaction_By_Total_Posts.png)

![User_Reaction_Overview](data/output/visualisations/User_Reaction_Overview.png)

TEST SUBMISSION: DATA ANALYST <br>
Below contains the details on findings from analysis done on the vice dataset. 

**Important metrics:**

* Post Created – the timestamp denoting when the given post was made;

* Video Share Status – owned (originally released by that page) vs crosspost (reposted from a different page);

* Total Views – views the video amassed when it was published that one time;

* Total Views For All Crossposts -  views the video has amassed over all times it was posted;

* Video Length – duration of the video in minutes;

* Message and Link Text – the description and title text viewers see next to the video;

* Likes/Comments/Shares/Love/… - ways how viewers have engaged with the video.

**Extra columns generated (these columns were generated to analyze content types using the message column; the aim is to analyze the relationship between the categories of topics posted and the user interaction):**

* entertainment - counts all occurrence of words from post related to entertainment;

* environment_&_economy - counts all occurrences of words in posts related to environment and economy;

* peace - counts all occurrence of words in post that has to do with non violence;

* politics - counts all occurrence of words in post that has to do with politics;

* religion - counts all occurrences of words that have to do with religion and tradition;

* science - counts all occurrences of words that have to do with science;

* violence_&_crime - counts all occurrences of words that have to do with violence and crime;

* customer_reaction - this categorizes customer reactions into 3 classes (positive, negative and neutral);

* content_type - this classifies content sub-topics into 5 classes (no_label, general_knowledge, science_nd_religion, politics and peace_nd_violence) 

Code: <br>
<a href="https://datastudio.google.com/reporting/5f526187-6b3c-404c-87bc-97be2b3c7827/page/0bQoC/edit"> Code to process data and to generate new columns can be found here </a>


## Findings

Answers(1) - Based on the data, comment on how VICE’s content strategy has shifted over time. You are free to focus on just a few aspects of your choice.

* Based on the data, over time the VICE strategy maintains a consistently high share of posts sourced from `cross posts`, while the rest of the posts are distributed between `Owned` and `Shared` posts.

* The all-time content analysis also revealed a consistently high post rate of `entertainment` related posts from `May 2018` to `Oct 2018`, followed by an interwoven high post rate between `science` related posts and `entertainment` related posts from `Oct 2018` to `Jan 2020`, after which a very high post rate of `science` related posts was recorded from `March 2020` to `May 2020`. During that same period, a high `video length` for content and a high `like reaction` from users were also recorded. In `Nov 2019` a very `high neutral reaction` was recorded as `religion` related posts reached their highest all-time post rate.

* Finally on post source analysis, using the drill down feature available on the chart, we would find that 86% of vice total posts are from crosspost, 13% of vice total posts are owned, while the remaining 0.4% is shared. 

        - Drilling down one level into the chart for crosspost, we find an almost balanced post rate between `science_nd_religion` posts at 24.6% and `general_knowledge` posts at 24.2%; politics-related posts have a 12% rate and `peace_nd_violence` related posts an 8.8% rate.

        - Drilling down one level into the chart for owned, we find that owned posts have a very high post rate for `general_knowledge` at 38.6%, followed by `science_nd_religion` posts at 16.1%; politics-related posts have an 8% rate and `peace_nd_violence` related posts a 5.6% rate.

        - Drilling down one level into the chart for shared posts, we see a slight shift in the post strategy: shared posts have a high `general_knowledge` post rate at 26.3% and a balanced post rate between `politics` posts and `science_nd_religion` posts, at 8.8% for `politics` and 8.8% for `science_nd_religion`; finally, `peace_nd_violence` related posts have a 3.7% rate.

        - Across all post sources, we can conclude that VICE publishes fewer `violence` and `crime` related posts, while posts related to `general_knowledge`, `science` and `religion` are promoted more.

Answers(2) - Are all kinds of engagement beneficial for video popularity? Naturally, a more popular video will have more reactions of all kinds, but does a higher fraction of, say, “Angry” reactions, have a negative effect on video performance?

* From the all-time trend analysis in the dashboard, we notice an `opposite reaction` between `negative reactions` and `positive reactions`: videos with `high positive reactions` tend to have `low negative reactions` when compared to previous trends. So individual user reactions might not have a clear direct effect on performance, but after further classification into clear `positive`, `negative` and `neutral` reactions, we can see that a grouped `negative reaction` can in turn affect a grouped `positive reaction`.

Answers(3) - Are there any topics, word combinations which always perform higher than average, or have been successful as of recently? Hint: you could use NLP.

* Given the high rate of `science` and `entertainment` related posts, we notice an all-time direct positive effect of these topics on the positive reactions received; we also notice that as topics on violence and crime begin to rise to the top, positive reactions perform poorly.

### Note

Notes:
* Word combinations for each sub-topics generated for this analysis can be found here <br>
* Sub-topic combination for post categories generated for this analysis can be found here <br>
* User reaction combination to get reaction category for this analysis can be found here <br>
* Final processed data used for this analysis can be found here <br>
* Main script used to process data can be found here or this <br>