In [1]:
import jarvis
import pandas as pd
import numpy as np

# Where do I start? Summarize current and past experiments

In [2]:
# Ask jarvis for the per-version summaries of the 'twitter_demo' experiment.
# NOTE(review): judging by how `summary` is indexed in later cells, this appears to be a
# list of (commit_hash, trials DataFrame) pairs — confirm against the jarvis API.
summary = jarvis.listVersionSummaries('twitter_demo')

100%|██████████| 2/2 [00:11<00:00,  5.51s/it]


In [3]:
# Display the raw summary object so each version's commit hash and trial table can be inspected.
summary

[('c01502400e82d88fffa8a1988085fa04421e2dd2',
     __trialNum__  alpha  frac       model  model_accuracy  split_seed
  0             9    0.9  0.75  NOT LOADED         0.58152          42
  1             0    0.0  0.75  NOT LOADED         0.79514          42
  2             7    0.7  0.75  NOT LOADED         0.60018          42
  3             6    0.6  0.75  NOT LOADED         0.61348          42
  4             1    0.1  0.75  NOT LOADED         0.73202          42
  5            10    1.0  0.75  NOT LOADED         0.57407          42
  6             8    0.8  0.75  NOT LOADED         0.58993          42
  7             4    0.4  0.75  NOT LOADED         0.64544          42
  8             3    0.3  0.75  NOT LOADED         0.66530          42
  9             2    0.2  0.75  NOT LOADED         0.69506          42
  10            5    0.5  0.75  NOT LOADED         0.62654          42),
 ('597cb760c89b1db5b3efd26c981539fffe79bb83',
     __trialNum__  alpha  frac       model  model_accu

# What is the best experiment?
Here, we measure the _goodness_ of an experiment by its average model accuracy over all trials.

In [4]:
# Pair every version's commit hash with the mean of its 'model_accuracy' column.
summary_stat = [
    (version[0], version[1]['model_accuracy'].mean())
    for version in summary
]
summary_stat

[('c01502400e82d88fffa8a1988085fa04421e2dd2', 0.6471527272727273),
 ('597cb760c89b1db5b3efd26c981539fffe79bb83', 0.713990909090909)]

In [5]:
# Tabulate the (commit hash, mean accuracy) pairs: column 0 holds the hash, column 1 the mean.
summary_stat_df = pd.DataFrame(summary_stat)
# idxmax returns the index *label* of the row with the highest mean accuracy.
best_index = summary_stat_df.iloc[:, 1].idxmax()
# Look the hash up through that label instead of a hard-coded row number, so the
# selection follows the data rather than the order in which versions were listed.
best_commit_hash = summary_stat_df.loc[best_index, 0]
best_commit_hash

'597cb760c89b1db5b3efd26c981539fffe79bb83'

# What is the best trial, given the best experiment?

In [6]:
# Pick the trial table that belongs to the best version (IndexError if the hash is absent).
best_xp_df = [version[1] for version in summary if version[0] == best_commit_hash][0]
# idxmax yields an index label, so select the row with label-based .loc; positional .iloc
# would only agree with it while the frame keeps a default 0..n-1 index.
best_trial = best_xp_df.loc[best_xp_df['model_accuracy'].idxmax(), '__trialNum__']
best_trial

'0'

# So the older experiment is better than the more recent one. What changed?

In [7]:
# Collect the commit hash of every version listed in the summary.
versions = [version[0] for version in summary]
# The unpacking below requires exactly two versions; the first entry is taken to be the newer one.
most_recent, least_recent = versions

In [8]:
# Show what changed between the two versions of 'twitter_demo', passing the older hash
# first and the newer hash second.
# NOTE(review): the argument order presumably means (from, to) — confirm against the jarvis API.
jarvis.diffExperimentVersions('twitter_demo', least_recent, most_recent)

[0;30mtrain_model.py --> train_model.py
[0;30m@@ -47,8 +47,8 @@ def train(tweet_df, alpha):
[0;30m     ## Convert tweet to bag of words for learning
[0;30m 
[0;30m     # Tokenize Text
[1;31m-    count_vect = CountVectorizer()
[1;31m-    #count_vect = TfidfVectorizer()
[1;32m+    #count_vect = CountVectorizer()
[1;32m+    count_vect = TfidfVectorizer()
[0;30m     X_train = count_vect.fit_transform(tweet_df["tweet"])
[0;30m 
[0;30m     intermediary["vectorizer"] = count_vect


# Let's get the best model yet!
We load it from the best trial of the best experiment.

In [9]:
# Retrieve 'intermediary.pkl' for the chosen trial and commit of the 'twitter_demo' experiment.
# NOTE(review): presumably this unpickles an artifact produced by that version — confirm,
# and only materialize artifacts from sources you trust, since unpickling can run code.
best_intermediary = jarvis.materialize('twitter_demo', best_trial, best_commit_hash, 'intermediary.pkl')

In [10]:
# Display the materialized object; the output shows a dict with entries such as
# 'classifier' and 'country_dict'.
best_intermediary

{'classifier': MultinomialNB(alpha=0.0, class_prior=None, fit_prior=True),
 'country_dict': {' ': 32,
  ' AD': 21,
  ' AE': 105,
  ' AF': 15,
  ' AG': 119,
  ' AI': 106,
  ' AL': 155,
  ' AM': 148,
  ' AO': 25,
  ' AR': 11,
  ' AT': 62,
  ' AU': 55,
  ' AW': 66,
  ' AZ': 77,
  ' BA': 92,
  ' BB': 79,
  ' BD': 28,
  ' BE': 149,
  ' BG': 29,
  ' BH': 157,
  ' BI': 150,
  ' BJ': 12,
  ' BM': 98,
  ' BN': 18,
  ' BO': 53,
  ' BQ': 156,
  ' BR': 97,
  ' BS': 9,
  ' BW': 91,
  ' BY': 7,
  ' CA': 50,
  ' CD': 162,
  ' CG': 90,
  ' CH': 59,
  ' CI': 40,
  ' CL': 22,
  ' CM': 121,
  ' CN': 36,
  ' CO': 100,
  ' CR': 76,
  ' CU': 102,
  ' CV': 96,
  ' CY': 74,
  ' CZ': 116,
  ' DE': 169,
  ' DK': 34,
  ' DM': 109,
  ' DO': 137,
  ' DZ': 115,
  ' EC': 111,
  ' EE': 80,
  ' EG': 85,
  ' ES': 163,
  ' ET': 142,
  ' FI': 67,
  ' FR': 78,
  ' GA': 112,
  ' GB': 30,
  ' GE': 138,
  ' GH': 145,
  ' GI': 56,
  ' GL': 173,
  ' GN': 108,
  ' GP': 143,
  ' GR': 172,
  ' GT': 19,
  ' GU': 14,
  ' GY': 63,
 

# Here's the best model in action

In [12]:
# Unpack the pieces of the materialized intermediary that prediction needs.
vectorizer = best_intermediary['vectorizer']
classifier = best_intermediary['classifier']
country_dict = best_intermediary['country_dict']

# Invert country_dict so a predicted value can be mapped back to its country key.
code_dict = {class_id: country for country, class_id in country_dict.items()}

# Keep prompting until the user types 'exit'; every other entry is vectorized and classified.
for tweet in iter(lambda: input("What's on your mind? "), 'exit'):
    tweet_vec = vectorizer.transform(np.array([tweet]))
    country_id = classifier.predict(tweet_vec)
    predicted_country = code_dict[country_id[0]]
    print("Predicted country of origin: {}\n".format(predicted_country))

What's on your mind? OMG This works!! :D
Predicted country of origin:  US

What's on your mind? exit
