The root problem is that the manager of a PR firm would like to understand the strength of the relationships between her employees and their various contacts. If she has multiple employees who are in contact with the same person, how does she quantify the magnitude of their relationships, and whom does she assign to reach out to that contact?

The challenge we're addressing here is predicting whether a pitch email is likely to lead to a positive result.

# IMPORT 

In [None]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from tqdm import tqdm # This is an awesome package for tracking for loops
from collections import OrderedDict, Counter
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

#from google.cloud import language

%matplotlib inline

In [None]:
# pitches = pd.read_json('/Users/thomasmulhern/Downloads/pitchesDataJson/pitch.json')
# responses = pd.read_json('/Users/thomasmulhern/Downloads/pitchesDataJson/response.json')
# followup = pd.read_json('/Users/thomasmulhern/Downloads/pitchesDataJson/followup.json')
# results = pd.read_json('/Users/thomasmulhern/Downloads/pitchesDataJson/result.json')

In [None]:
# Local file with functions for opening and writing .pickle files from/to the local 'pickles' folder
from useful_functions import openp, writep

In [None]:
pitches = openp('pitches')

In [None]:
followup = openp('followup')

In [None]:
responses = openp('responses')

In [None]:
results = openp('results')

In [None]:
pitches_with_results = openp('pitches_with_results')

In [None]:
pitches

In [None]:
followup

In [None]:
responses

In [None]:
results

In [None]:
pitches_with_results

### Labels that are in Results, but not Pitches

In [None]:
# Column labels present in `results` but absent from `pitches`.
results_only_columns = results.columns.difference(pitches.columns)
results_unique_labels = results_only_columns.tolist()
results_unique_labels

### Labels that are in Pitches, but not Results

In [None]:
# Column labels present in `pitches` but absent from `results`.
pitches_only_columns = pitches.columns.difference(results.columns)
pitches_unique_labels = pitches_only_columns.tolist()
pitches_unique_labels

In [None]:
# Do results only ever carry a positive tone?
# ANSWER: NO -- see the tone breakdown for results of type 'Other' below.
other_type_mask = results['type'] == 'Other'
results.loc[other_type_mask, 'tone'].value_counts()

# EDA

#### What percentage of the results are positive?

In [None]:
# What percentage of the results are positive?
# value_counts() skips missing tones by default, so the total below only
# covers results that actually carry a tone label.
tonecounts = results['tone'].value_counts()
# .get() keeps the cell from raising KeyError when no result is 'Positive'.
positive_count = tonecounts.get('Positive', 0)
total_count = tonecounts.sum()
positive_precentage = positive_count / total_count

# Scale to a percentage *before* rounding: rounding first and multiplying by
# 100 afterwards re-introduces float noise (e.g. 0.57 * 100 -> 56.99999999999999).
print(('positive result count: {0} \ntotal result count: {1}\n'
       '% of positive result: {2}%').format(
           positive_count, total_count, np.round(positive_precentage * 100, 1)))

#### How many unique users have positive results?

In [None]:
# How many users made pitches?
num_users = len(pitches['user_id'].unique())

# How many users have any result, and how many have a positive one?
user_with_results = len(results['user_id'].unique())
# positive_results is reused by later cells, so it stays a notebook-level name.
positive_results = results[results['tone']=='Positive']
positive_results_users = positive_results['user_id'].unique()
# The denominator is the number of pitching users, not users with results.
positive_results_users_percentage = len(positive_results_users) / num_users

# Scale to a percentage *before* rounding: rounding first and multiplying by
# 100 afterwards re-introduces float noise in the printed value.
print(('users making pitches: {}\nusers with results: {}\n'
       'users with positive results: {} \n'
       '% of users with positive results: {}%').format(
           num_users, user_with_results, len(positive_results_users),
           np.round(positive_results_users_percentage * 100, 1)))

#### How many campaigns end with positive results?

In [None]:
# How many campaigns were pitched?
num_campaigns = len(pitches['campaign_id'].unique())

# Campaign id of every result row (ids may repeat); the distinct count is
# taken at print time.
campaigns_with_results = results['campaign_id']

# How many campaigns ended with positive results?
campaigns_with_positive_results = len(positive_results['campaign_id'].unique())

# What percentage of pitched campaigns ended with positive results?
positive_campaigns_percentage = campaigns_with_positive_results / num_campaigns

# Scale to a percentage *before* rounding: rounding first and multiplying by
# 100 afterwards re-introduces float noise in the printed value.
print(('number of campaigns: {0} \ncampaigns ending in results: {1} \n'
       'campaigns ending in positive results: {2} \n'
       '% of campaigns ending in positive results: {3}%').format(
           num_campaigns, len(campaigns_with_results.unique()),
           campaigns_with_positive_results,
           np.round(positive_campaigns_percentage * 100, 1)))

#### Which campaigns ended with a positive result?

In [None]:
# Distinct campaign ids that have at least one result with a 'Positive' tone.
is_positive_tone = results['tone'] == 'Positive'
successful_campaigns = results.loc[is_positive_tone, 'campaign_id'].unique()

print('first five positive campaigns (unordered): \n{0}'.format(successful_campaigns[:5]))

#### How many contacts create positive results?

In [None]:
# How many distinct contacts were pitched?
num_contacts = len(pitches['contact_id'].unique())

# How many contacts have delivered some result?
num_contacts_results = len(results['contact_id'].unique())

# How many contacts have delivered positive results?
num_contacts_positive_results = len(positive_results['contact_id'].unique())

# Percentage of pitched contacts that ended with positive results.
contacts_positive_results_percentage = num_contacts_positive_results / num_contacts

# Scale to a percentage *before* rounding: rounding first and multiplying by
# 100 afterwards re-introduces float noise in the printed value.
print(('number of contacts: {0}\ncontacts with results: {1}\n'
       'contacts with positive results: {2}\n'
       '% of contacts with positive results: {3}%').format(
           num_contacts, num_contacts_results, num_contacts_positive_results,
           np.round(contacts_positive_results_percentage * 100, 1)))

#### Who are the top clients by number of campaigns?

In [None]:
# How many clients are running campaigns?
clients = pitches['client_id'].unique().size

# Per-client tally. count() counts the non-null campaign_id values in each
# group, i.e. rows of `pitches`, not distinct campaigns.
grouped = pitches.groupby('client_id')
campaigns_per_client = dict(grouped['campaign_id'].count())

# Keep the 20 clients with the largest tallies, in descending order.
client_ranking = Counter(campaigns_per_client).most_common(20)
top_clients = OrderedDict(client_ranking)

print('Number of clients: ', clients)
print('\nTop clients by number of campaigns (id, count): ')
top_clients


#### How many successful campaigns have they had?

In [None]:
# Top client's count of positive results
def calc_top_client_positive_results(lst, positives=None):
    """Count positive results per client id.

    Parameters
    ----------
    lst : iterable
        Client ids to report on. Iterating a dict yields its keys, so an
        ``OrderedDict`` keyed by client id can be passed directly.
    positives : pandas.DataFrame, optional
        Frame holding one row per positive result with a ``client_id``
        column. Defaults to the notebook-level ``positive_results``.

    Returns
    -------
    OrderedDict
        ``client_id -> number of rows`` in ``positives`` for that client,
        in the iteration order of ``lst``; ids with no rows map to 0.
    """
    if positives is None:
        positives = positive_results
    # Tally every client once instead of re-filtering the whole frame per id.
    # to_dict() gives plain dict lookups, avoiding Series.get's label-versus-
    # position ambiguity for integer keys.
    counts = positives['client_id'].value_counts().to_dict()
    return OrderedDict((client_id, int(counts.get(client_id, 0))) for client_id in lst)
# top_clients is a dict keyed by client id; the helper iterates those keys, so
# the result lists the same clients in the same order.
top_client_positive_results = calc_top_client_positive_results(top_clients)

print("Top client's count of positive results:")
top_client_positive_results

#### What are those client's rates of success?

In [None]:
# Positive results as a percentage of each top client's tally, stored as a
# formatted string such as '12.3%'.
success_percent = OrderedDict()
for client_id, total in top_clients.items():
    # Direct dict lookup replaces the former nested loop that scanned every
    # (client, positives) pair just to find the matching key.
    if client_id in top_client_positive_results:
        wins = top_client_positive_results[client_id]
        success_percent[client_id] = '{0}%'.format(np.round((wins / total) * 100, 1))
print("Top client's historical rates of positive success:")
success_percent

In [None]:
# success_percent stores formatted strings such as '12.3%'. Bar heights must be
# numeric, so strip the '%' suffix and convert back to floats before plotting
# (str() keeps this working if the values are already numbers).
bar_positions = range(len(success_percent))
bar_heights = [float(str(rate).rstrip('%')) for rate in success_percent.values()]
plt.bar(bar_positions, bar_heights, align='center')
_ = plt.xticks(bar_positions, list(success_percent.keys()))
plt.margins(0.05, 0)

#### Who are the most prolific users?

In [None]:
# How many users have run campaigns?
users = pitches['user_id'].unique().size

# Per-user tally. count() counts the non-null campaign_id values in each
# group, i.e. rows of `pitches`, not distinct campaigns.
grouped = pitches.groupby('user_id')
campaigns_per_user = dict(grouped['campaign_id'].count())

# Keep the 20 users with the largest tallies, in descending order.
user_ranking = Counter(campaigns_per_user).most_common(20)
top_users = OrderedDict(user_ranking)

print('Number of users: ', users)
print('\nTop 20 users by number of campaigns (id, count): ')
top_users


In [None]:
# Share of all pitch rows that belong to the 20 users in top_users.
top_user_share = sum(top_users.values()) / len(pitches)
z = np.round(top_user_share * 100, 1)
print('The top 20 users have {}% of the pitches'.format(z))

#### Who are the most successful users?

In [None]:
# Per-user tally of positive-result rows (non-null campaign_id values).
grouped = positive_results.groupby('user_id')
successful_campaigns_per_user = dict(grouped['campaign_id'].count())

# The 20 users with the most positive results, in descending order.
positive_ranking = Counter(successful_campaigns_per_user).most_common(20)
successful_users = OrderedDict(positive_ranking)
successful_users

In [None]:
# Share of all positive results held by the 20 users in successful_users.
# Scale to a percentage *before* rounding (as the pitch-share cell does);
# rounding first and multiplying by 100 afterwards re-introduces float noise.
z = np.round(sum(successful_users.values()) / len(positive_results) * 100, 1)
print('The top 20 users have {}% of the positive results'.format(z))

In [None]:
positive_results
len(positive_results)

In [None]:
len(positive_results['campaign_id'].unique())

In [None]:
list(positive_results['campaign_id'].unique())

In [None]:
# Python type of every pitch_id attached to a positive result.
x = list(map(type, positive_results['pitch_id']))
len(x)

In [None]:
# How many pitch_ids of each Python type do the positive results carry?
x = list(map(type, positive_results['pitch_id']))
Counter(x)

In [None]:
type(positive_results['pitch_id'][1])

In [None]:
positive_results.columns

In [None]:
len(results)

In [None]:
results['tone'].value_counts()

In [None]:
# NOTE(review): 10329 is a hard-coded figure, presumably one of the tone counts
# printed by the previous cell -- confirm it still matches the loaded data.
10329/sum(results['tone'].value_counts())

In [None]:
results['tone'].unique()

In [None]:
# Results minus the rows whose tone is the "Don't even count it" label.
is_counted = results['tone'] != "Don't even count it"
cresults = results.loc[is_counted]

In [None]:
cresults['tone'].value_counts()

In [None]:
# How many of the counted results lack a pitch_id (True) versus have one (False)?
missing_pitch_id = cresults['pitch_id'].isna()
x = missing_pitch_id.value_counts()
x

In [None]:
# NOTE(review): hard-coded figures, presumably copied from the isnull() counts
# printed by the previous cell -- confirm they still match the loaded data.
nums = 5240/12769
nums

In [None]:
# Complement of `nums`, derived from the variable rather than a float literal
# pasted from its printed output, so the two cannot drift apart.
nulls = 1 - nums
nulls

In [None]:
results_with_pitch_id = results['pitch_id']

In [None]:
# Drop the None entries so only actual pitch ids remain; `is not None` is the
# idiomatic identity test and matches the former type() comparison exactly.
results_with_pitch_id = np.array(
    [pitch_id for pitch_id in results_with_pitch_id if pitch_id is not None])

In [None]:
len(results_with_pitch_id)

## ADD COLUMN TO PITCHES THAT TIES THEM TO RESULTS

In [None]:
# Earlier attempt, kept for context: initialise pitches['results'] to 0 and set
# it to 1 through chained indexing where pitches['id'] is in results_with_pitch_id.
def add_results(series, pitch_ids=None):
    """Flag which ids in ``series`` appear among the result pitch ids.

    The original definition had no body (a SyntaxError); this implements the
    intent of the commented-out assignment it was left with.

    Parameters
    ----------
    series : pandas.Series
        Ids to test, e.g. ``pitches['id']``.
    pitch_ids : list-like, optional
        Ids that have a result attached. Defaults to the notebook-level
        ``results_with_pitch_id``.

    Returns
    -------
    pandas.Series
        Integer Series aligned with ``series``: 1 where the id is in
        ``pitch_ids``, 0 otherwise.
    """
    if pitch_ids is None:
        pitch_ids = results_with_pitch_id
    return series.isin(pitch_ids).astype(int)

# Usage: pitches['results'] = add_results(pitches['id'])


In [None]:
pitches['results'].value_counts()

In [None]:
pitches

In [None]:
len(pitches)

In [None]:
len(results)

In [None]:
#len(positive_results)

In [None]:
#results['pitch_id'].value_counts()

In [None]:
results['pitch_id'].value_counts()

In [None]:
len(results['pitch_id'].unique())

In [None]:
# How many pitch_ids of each Python type do the positive results carry?
x = list(map(type, positive_results['pitch_id']))
Counter(x)

In [None]:
4204/90573

In [None]:
# pitches['score'] = 0
# pitches['magnitude'] = 0

In [None]:
# pitches_f = open('../pickles/pitches.pickle', 'rb')
# pitches =  pickle.load(pitches_f)
# pitches_f.close()

In [None]:
def gc_sentiment(text, client=None):
    """Run Google Cloud Natural Language sentiment analysis on ``text``.

    Parameters
    ----------
    text : str
        Plain-text content to analyse.
    client : language.LanguageServiceClient, optional
        Client to reuse. When omitted a new one is created for this call,
        as before; pass a shared client when scoring many texts in a loop
        so it is not rebuilt every time.

    Returns
    -------
    tuple
        ``(score, magnitude)`` taken from the response's
        ``document_sentiment``.
    """
    if client is None:
        client = language.LanguageServiceClient()
    document = language.types.Document(
            content=text,
            type=language.enums.Document.Type.PLAIN_TEXT)
    annotations = client.analyze_sentiment(document=document)
    sentiment = annotations.document_sentiment
    return sentiment.score, sentiment.magnitude

In [None]:
# gc_results = [gc_sentiment(row) for row in tqdm(pitches['body'][:10], ncols = 100)]
# gc_score, gc_magnitude = zip(*gc_results) # Unpacking the result into 2 lists
# gc = list(zip(pitches['body'][:10], gc_score, gc_magnitude))
# columns = ['body', 'score', 'magnitude']
# gc_df = pd.DataFrame(gc, columns = columns)

In [None]:
positive_results


In [None]:
str(pitches['body'][138])

In [None]:
str(list(pitches[pitches['id'] == '1cba4a35-6bed-4b72-8682-fc0c16fe3b54']['body']))

In [None]:
len(pitches['body'])

In [None]:
# Count pitches with a missing body. An elementwise `== None` comparison does
# not flag missing values in pandas, so use isnull() for the check.
pitches['body'].isnull().value_counts()

In [None]:
# for idx, pitch in enumerate(pitches['body']):
#     if type(pitch) is type(None):
#         pitches['body'].drop(idx, inplace=True)
#         print(idx)
#     if (len(pitch) < 10) or (type(pitch) == None):
#         print(idx)
#         pitches['body'].drop(idx, inplace=True)


In [None]:
# Reload `pitches` from the local pickle. The context manager closes the file
# even if unpickling raises.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced locally by this project.
with open('../pickles/pitches.pickle', 'rb') as pitches_f:
    pitches = pickle.load(pitches_f)

In [None]:
# Distinct, non-null pitch ids that appear in `results`.
# NOTE(review): the name suggests positive results only, but this reads from
# `results`, not `positive_results` -- confirm which one is intended.
pos_id = results['pitch_id'].dropna().unique()
pos_id

In [None]:
pd.crosstab(pitches_with_results.contact_id, pitches_with_results.user_id)

In [None]:
pd.crosstab(pitches.contact_id, pitches.user_id).describe()

In [None]:
pitches_with_results.columns