# Answers and Analysis/Findings for Thursday Board Meeting

In [1]:
# Importing necessary tools and files
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics


import joint_acquire
import joint_prepare
import env


In [2]:
# Utilizing acquire.py to assign dataset to `df`
df = joint_acquire.acquire_logs(user=env.user, password=env.password, host=env.host)
df

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900218,2021-04-21,16:41:51,jquery/personal-site,64,28.0,71.150.217.33,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0
900219,2021-04-21,16:42:02,jquery/mapbox-api,64,28.0,71.150.217.33,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0
900220,2021-04-21,16:42:09,jquery/ajax/weather-map,64,28.0,71.150.217.33,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0
900221,2021-04-21,16:44:37,anomaly-detection/discrete-probabilistic-methods,744,28.0,24.160.137.86,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0


In [3]:
# Utilizing prepare.py to clean and prep dataset;
# Separated data with lack of context into `df_admin`
df, df_admin = joint_prepare.prepare_logs(df)

**This notebook displays dataframes with default settings** \
**If you wish to see full dataframes, uncomment and run the following cell**

In [4]:
# Expanding view of dataframes and information in this notebook
##### Uncomment the below lines for expanded view #####
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)

-----------------------------------------------------------------------------------------------------------
## 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [5]:
# Dataframe Variable Overview
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 847329 entries, 0 to 900222
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   date           847329 non-null  object 
 1   time           847329 non-null  object 
 2   path           847329 non-null  object 
 3   user_id        847329 non-null  int64  
 4   cohort_id      847329 non-null  float64
 5   ip             847329 non-null  object 
 6   name           847329 non-null  object 
 7   start_date     847329 non-null  object 
 8   end_date       847329 non-null  object 
 9   created_at     847329 non-null  object 
 10  program_id     847329 non-null  float64
 11  primary_topic  847329 non-null  object 
 12  subtopic       847329 non-null  object 
 13  tertiary       847329 non-null  object 
dtypes: float64(2), int64(1), object(11)
memory usage: 129.2+ MB


In [6]:
# Dataframe Overview
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,start_date,end_date,created_at,program_id,primary_topic,subtopic,tertiary
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,1.0,,,
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,1.0,java-ii,,
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,1.0,java-ii,object-oriented-programming,
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,1.0,slides,object_oriented_programming,
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,Teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2.0,javascript-i,conditionals,


In [7]:
# Splitting dataframe by program into 4 separate dfs
full_stack_php = df[df.program_id == 1]
full_stack_java = df[df.program_id == 2]
data_science = df[df.program_id == 3]
front_end = df[df.program_id == 4]

### Full-Stack PHP Program

In [8]:
# Full-Stack PHP: Count of Modules Accessed by Cohort
php_primary_count = (full_stack_php.groupby('name').primary_topic.value_counts())

In [9]:
# Full-Stack PHP: Percentage of Modules Accessed by Cohort
php_primary_count_pct = (full_stack_php.groupby('name').primary_topic.value_counts(normalize=True))

In [10]:
# Dataframe of Count of Modules Accessed by Cohort
pd.DataFrame(php_primary_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_topic
name,primary_topic,Unnamed: 2_level_1
Arches,javascript-i,1333
Arches,html-css,1031
Arches,spring,914
Arches,mysql,775
Arches,java-iii,696
...,...,...
Quincy,java-iii,3
Quincy,jquery,3
Quincy,introduction,2
Quincy,13-advanced-topics,1


In [11]:
# Dataframe of Percentage of Modules Accessed by Cohort
pd.DataFrame(php_primary_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_topic
name,primary_topic,Unnamed: 2_level_1
Arches,javascript-i,0.149944
Arches,html-css,0.115973
Arches,spring,0.102812
Arches,mysql,0.087177
Arches,java-iii,0.078290
...,...,...
Quincy,java-iii,0.002425
Quincy,jquery,0.002425
Quincy,introduction,0.001617
Quincy,13-advanced-topics,0.000808


In [12]:
# Full-Stack PHP: Count of Lessons Accessed by Cohort
php_sub_count = (full_stack_php.groupby('name').subtopic.value_counts())

In [13]:
# Full-Stack PHP: Percentage of Lessons Accessed by Cohort
php_sub_count_pct = (full_stack_php.groupby('name').subtopic.value_counts(normalize=True))

In [14]:
# Dataframe of Count of Lessons Accessed by Cohort
pd.DataFrame(php_sub_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Arches,,1803
Arches,,626
Arches,fundamentals,574
Arches,introduction,362
Arches,css-ii,320
...,...,...
Quincy,misleading1_fox.jpg,1
Quincy,misleading3_deaths.jpg,1
Quincy,modules,1
Quincy,project,1


In [15]:
# Dataframe of Percentage of Lessons Accessed by Cohort
pd.DataFrame(php_sub_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Arches,,0.202812
Arches,,0.070416
Arches,fundamentals,0.064567
Arches,introduction,0.040720
Arches,css-ii,0.035996
...,...,...
Quincy,misleading1_fox.jpg,0.000808
Quincy,misleading3_deaths.jpg,0.000808
Quincy,modules,0.000808
Quincy,project,0.000808


### Full-Stack Java Program

In [16]:
# Full-Stack Java: Count of Modules Accessed by Cohort
java_primary_count = (full_stack_java.groupby('name').primary_topic.value_counts())

In [17]:
# Full-Stack Java: Percentage of Modules Accessed by Cohort
java_primary_count_pct = (full_stack_java.groupby('name').primary_topic.value_counts(normalize=True))

In [18]:
# Dataframe of Count of Modules Accessed by Cohort
pd.DataFrame(java_primary_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_topic
name,primary_topic,Unnamed: 2_level_1
Andromeda,javascript-i,4077
Andromeda,mysql,3029
Andromeda,html-css,2865
Andromeda,spring,2850
Andromeda,java-iii,2117
...,...,...
Zion,Intro_to_Regression,1
Zion,arash-arghavan,1
Zion,collections,1
Zion,git,1


In [19]:
# Dataframe of Percentage of Modules Accessed by Cohort
pd.DataFrame(java_primary_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_topic
name,primary_topic,Unnamed: 2_level_1
Andromeda,javascript-i,0.160771
Andromeda,mysql,0.119445
Andromeda,html-css,0.112978
Andromeda,spring,0.112386
Andromeda,java-iii,0.083481
...,...,...
Zion,Intro_to_Regression,0.000026
Zion,arash-arghavan,0.000026
Zion,collections,0.000026
Zion,git,0.000026


In [20]:
# Full-Stack Java: Count of Lessons Accessed by Cohort
# Full-Stack Java: Percentage of Lessons Accessed by Cohort
java_sub_count = (full_stack_java.groupby('name').subtopic.value_counts())
java_sub_count_pct = (full_stack_java.groupby('name').subtopic.value_counts(normalize=True))

In [21]:
# Dataframe of Count of Lessons Accessed by Cohort
pd.DataFrame(java_sub_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Andromeda,,4381
Andromeda,fundamentals,1757
Andromeda,introduction,1387
Andromeda,,1176
Andromeda,css-i,982
...,...,...
Zion,mysql,1
Zion,pair-programming,1
Zion,professional-development,1
Zion,traversing,1


In [22]:
# Dataframe of Percentage of Lessons Accessed by Cohort
pd.DataFrame(java_sub_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Andromeda,,0.172759
Andromeda,fundamentals,0.069285
Andromeda,introduction,0.054695
Andromeda,,0.046374
Andromeda,css-i,0.038724
...,...,...
Zion,mysql,0.000026
Zion,pair-programming,0.000026
Zion,professional-development,0.000026
Zion,traversing,0.000026


### Data Science Program

In [23]:
# Data Science: Count of Lessons Accessed by Cohort
ds_sub_count = (data_science.groupby('name').subtopic.value_counts())

In [24]:
# Data Science: Percentage of Lessons Accessed by Cohort
ds_sub_count_pct = (data_science.groupby('name').primary_topic.value_counts(normalize=True))

In [25]:
# Dataframe of Count of Lessons Accessed by Cohort
pd.DataFrame(ds_sub_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Bayes,,1967
Bayes,1-overview,1826
Bayes,project,1024
Bayes,modern-data-scientist.jpg,674
Bayes,AI-ML-DL-timeline.jpg,672
...,...,...
Florence,spark-execution-diagram.svg,1
Florence,spark-local-mode.svg,1
Florence,spreadsheets-overview,1
Florence,user-defined-functions,1


In [26]:
# Dataframe of Percentage of Lessons Accessed by Cohort
pd.DataFrame(ds_sub_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_topic
name,primary_topic,Unnamed: 2_level_1
Bayes,1-fundamentals,0.117718
Bayes,6-regression,0.096955
Bayes,4-python,0.090135
Bayes,3-sql,0.087158
Bayes,,0.074120
...,...,...
Florence,group-by,0.000117
Florence,joins,0.000117
Florence,ml-methodologies-drawing.jpg,0.000117
Florence,sql.mysql-overview,0.000117


### Question 1: Answer
From our exploration, we've determined that the lessons that attract the most traffic consistently across cohorts are the fundamentals and concept introduction pages. Regardless of program, it seems that these initial pages, and associated fundamental concepts are most visited across cohorts.

-----------------------------------------------------------------------------------------------------------
## 2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?


Analysis performed above answering the first question can also be used to answer this second question. 

**Takeaways: full stack php**
1. laravel is referred to significantly more by 'Kings' cohort than other cohorts; most of whom gloss over it.

**Takeaways: full stack java**
1. No outstanding differences within full stack java cohorts (all are rather uniform)

**Takeaways: data science**
1. Cohorts Darden and Easley referred to 'scale_features_or_not' significantly more than other cohorts

### Question 2: Answer
From our exploration, we've determined:
- Full-Stack Java Cohorts: No outstanding differences between cohorts (all are uniform)
- Full-Stack PHP Cohorts: Kings cohort refers to 'laravel' significantly more than other cohorts
- Data Science Cohorts: Darden and Easley cohorts refer to 'scale_features_or_not' significantly more than other cohorts

-----------------------------------------------------------------------------------------------------------
## 3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?

In [27]:
# Creating df that only pulls data of students who are actively in their class period
active_students = df[(df.date <= df.end_date)&(df.date >= df.start_date)]

In [28]:
# describe value counts of user_id, shows hits
active_students.user_id.value_counts().describe()

count     731.000000
mean      885.766074
std       647.254908
min         1.000000
25%       391.500000
50%       795.000000
75%      1235.500000
max      4786.000000
Name: user_id, dtype: float64

In [29]:
# Because the user_id is used on multiple rows, set it as index so it can be explored
active_students = active_students.set_index('user_id')

In [30]:
# Simplify dataframe, drop unused columns
active_students = active_students.drop(columns=['start_date','end_date','created_at','date','time'])

In [31]:
#  Shows simplified df
active_students.head()

Unnamed: 0_level_0,path,cohort_id,ip,name,program_id,primary_topic,subtopic,tertiary
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,javascript-i/conditionals,22.0,97.105.19.61,Teddy,2.0,javascript-i,conditionals,
2,javascript-i/loops,22.0,97.105.19.61,Teddy,2.0,javascript-i,loops,
3,javascript-i/conditionals,22.0,97.105.19.61,Teddy,2.0,javascript-i,conditionals,
3,javascript-i/functions,22.0,97.105.19.61,Teddy,2.0,javascript-i,functions,
2,javascript-i/loops,22.0,97.105.19.61,Teddy,2.0,javascript-i,loops,


In [32]:
# Utilizing simplified dataframe, we setup where we draw the line for limited use of active students, around 300 hits, below the lower IQR of 193
low_use_active_students = active_students[active_students.index.value_counts() <=300]
low_use_active_students.head()

Unnamed: 0_level_0,path,cohort_id,ip,name,program_id,primary_topic,subtopic,tertiary
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,mkdocs/search_index.json,21.0,97.105.19.61,Sequoia,2.0,mkdocs,search_index.json,
10,git/merge-conflict-demo,21.0,97.105.19.61,Sequoia,2.0,git,merge-conflict-demo,
10,mkdocs/search_index.json,21.0,97.105.19.61,Sequoia,2.0,mkdocs,search_index.json,
10,git/working-in-teams,21.0,97.105.19.61,Sequoia,2.0,git,working-in-teams,
10,/,21.0,97.105.19.61,Sequoia,2.0,,,


In [33]:
# Number of students with less than 300 total curriculum visits while active
low_use_active_students.index.nunique()

154

In [34]:
# Total number of students
df.user_id.nunique()

911

In [35]:
# Calculating percentage of low_use students
low_use_active_students.index.nunique() / df.user_id.nunique()

0.1690450054884742

In [36]:
# Visualizing simplified dataframe showing low use active students by cohort and number of curriculum visits
(pd.DataFrame(active_students[active_students.index.value_counts() <=150].groupby('user_id').name.value_counts())).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,name
user_id,name,Unnamed: 2_level_1
13,Teddy,79
16,Sequoia,141
17,Sequoia,134
22,Sequoia,114
24,Sequoia,26
...,...,...
976,Oberon,29
977,Oberon,86
978,Oberon,55
979,Oberon,127


**Note**  \
If we have time: Can we see how often these same students utilized the curriculum after they graduated?

### Question 3: Answer
From our exploration, we've determined that roughly 17% of students hardly accessed the curriculum when active. These students all had 300 or less total curriculum visits during their tenure as a Codeup student. These students represent almost every cohort. 

-----------------------------------------------------------------------------------------------------------
## 4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?

In [58]:
#create a df and use groupby with date, time, ip and the aggregation on the time, reset index, sort by time
scrapingip = df.groupby(['date', 'time', 'ip'])[['time']].agg([ 'count']).reset_index().sort_values(by=[('time', 'count')], ascending=False)


In [None]:
# Showing df
scrapingip.head()


In [None]:
# Showing df
scrapingip.columns


In [None]:
# We can see 2 ip addresses in particular hit dozens of webpages in a single second
scrapingip[scrapingip[('time', 'count')]> 10]

### Question 4: Answer
From our exploration, we've determined that there is suspicious activity. There are 2 ip addresses in particular that we know to be showing clear evidence of web-scraping. These ip addresses are:
204.44.112.76
172.124.70.146

-----------------------------------------------------------------------------------------------------------
## 5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?

In [37]:
# Create dataframe of observations that occurred starting with the year 2019
after_2019 = df[(df.date >= '2019-01-01')]

In [38]:
# Create dataframe of observations that occurred prior to the year 2019
before_2019 = df[(df.date < '2019-01-01')]

In [39]:
# Dictionary of path url for web dev program
web_dev_path= pd.DataFrame(after_2019[after_2019.program_id != 3].path.unique())
web_dev_path.rename({0 : 'path'}, axis=1, inplace=True)

In [40]:
# Dictionary of path url for data science program
ds_path = pd.DataFrame(after_2019[after_2019.program_id == 3].path.unique())
ds_path.rename({0 : 'path'}, axis=1, inplace=True)

In [41]:
# Adds new column with bootlean comparing after_2019['path'] to data science program urls dictionary
after_2019['ds_hit'] = after_2019['path'].isin(ds_path['path'])

In [42]:
# Adds new column with bootlean comparing after_2019['path'] to web dev program urls dictionary
after_2019['web_dev_hit'] = after_2019['path'].isin(web_dev_path['path'])

In [43]:
# Adds new column with bootlean comparing after_2019['web_dev_hit'] to after_2019['ds_hit']
after_2019['both_programs'] =  (after_2019.ds_hit == after_2019.web_dev_hit)

In [44]:
# Adds new column with bootlean comparing before_2019['path'] to data science program urls dictionary
before_2019['ds_hit'] = before_2019['path'].isin(ds_path['path'])

In [45]:
# Adds new column with bootlean comparing before_2019['path'] to web dev program urls dictionary
before_2019['web_dev_hit'] = before_2019['path'].isin(web_dev_path['path'])

In [46]:
# Adds new column with bootlean comparing before_2019['web_dev_hit'] to before_2019['ds_hit']
before_2019['both_programs'] =  (before_2019.ds_hit == before_2019.web_dev_hit)

In [47]:
# Dataframe of observations with both conditions of after_2019['both_programs] value matches "True" and after_2019['path] value does not math "/"
after_2019[(after_2019.both_programs == True) & (after_2019.path != '/')]

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,start_date,end_date,created_at,program_id,primary_topic,subtopic,tertiary,ds_hit,web_dev_hit,both_programs
175810,2019-01-01,12:56:12,java-i,274,26.0,67.11.239.2,Xanadu,2018-09-17,2019-02-08,2018-09-17 19:09:51,2.0,java-i,,,True,True,True
175814,2019-01-01,15:32:24,toc,301,27.0,72.181.106.116,Yosemite,2018-11-05,2019-04-03,2018-11-05 15:26:37,2.0,toc,,,True,True,True
175815,2019-01-01,15:32:26,javascript-ii,301,27.0,72.181.106.116,Yosemite,2018-11-05,2019-04-03,2018-11-05 15:26:37,2.0,javascript-ii,,,True,True,True
175816,2019-01-01,15:32:28,jquery,301,27.0,72.181.106.116,Yosemite,2018-11-05,2019-04-03,2018-11-05 15:26:37,2.0,jquery,,,True,True,True
175819,2019-01-01,16:38:34,java-iii,262,26.0,97.105.90.179,Xanadu,2018-09-17,2019-02-08,2018-09-17 19:09:51,2.0,java-iii,,,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900214,2021-04-21,16:41:29,javascript-i,64,28.0,71.150.217.33,Staff,2014-02-04,2014-02-04,2018-12-06 17:04:19,2.0,javascript-i,,,True,True,True
900215,2021-04-21,16:41:31,javascript-ii,64,28.0,71.150.217.33,Staff,2014-02-04,2014-02-04,2018-12-06 17:04:19,2.0,javascript-ii,,,True,True,True
900216,2021-04-21,16:41:49,jquery,64,28.0,71.150.217.33,Staff,2014-02-04,2014-02-04,2018-12-06 17:04:19,2.0,jquery,,,True,True,True
900217,2021-04-21,16:41:51,javascript-i/bom-and-dom/dom,875,135.0,24.242.150.231,Marco,2021-01-25,2021-07-19,2021-01-20 21:31:11,2.0,javascript-i,bom-and-dom,dom,True,True,True


In [48]:
# Dataframe of observations with both conditions of before_2019['both_programs] value matches "True" and before_2019['path] value does not math "/"
before_2019[(before_2019.both_programs == True) & (before_2019.path != '/')]

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,start_date,end_date,created_at,program_id,primary_topic,subtopic,tertiary,ds_hit,web_dev_hit,both_programs
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,1.0,java-ii,,,True,True,True
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,1.0,java-ii,object-oriented-programming,,True,True,True
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,Teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2.0,javascript-i,conditionals,,True,True,True
6,2018-01-26,09:56:46,javascript-i/conditionals,3,22.0,97.105.19.61,Teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2.0,javascript-i,conditionals,,True,True,True
13,2018-01-26,10:00:39,javascript-i,6,22.0,97.105.19.61,Teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2.0,javascript-i,,,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175794,2018-12-31,15:59:42,javascript-i/conditionals,128,23.0,69.232.100.94,Ulysses,2018-03-05,2018-07-19,2018-03-05 14:22:11,2.0,javascript-i,conditionals,,True,True,True
175801,2018-12-31,23:51:33,spring,289,27.0,98.6.94.51,Yosemite,2018-11-05,2019-04-03,2018-11-05 15:26:37,2.0,spring,,,True,True,True
175802,2018-12-31,23:51:45,java-i,289,27.0,98.6.94.51,Yosemite,2018-11-05,2019-04-03,2018-11-05 15:26:37,2.0,java-i,,,True,True,True
175804,2018-12-31,23:51:56,java-iii,289,27.0,98.6.94.51,Yosemite,2018-11-05,2019-04-03,2018-11-05 15:26:37,2.0,java-iii,,,True,True,True


### Question 5: Answer
From our exploration, we've determined that:
Users did have access to both curriculums prior to the year 2019. \
We do find evidence that there was an access shut-off during 2019. \
Users continue to have access to both curriculums until April 21, 2021. \
Unsure if users still have access to both curriculums due to last known entry is April 21, 2021 for raw data acquired.

-----------------------------------------------------------------------------------------------------------
## 6. What topics are grads continuing to reference after graduation and into their jobs (for each program)?

In [49]:
#  Observations that occured after each cohort end date by graduate
after_grad = df[(df.date > df.end_date)]

# Create four dataframes for each program
jarrid_full_stack_php = after_grad[after_grad.program_id == 1]
jarrid_full_stack_java = after_grad[after_grad.program_id == 2]
jarrid_data_science = after_grad[after_grad.program_id == 3]
jarrid_front_end = after_grad[after_grad.program_id == 4]


In [50]:
# Dataframe of Full Stack PHP program graduates page visits sorted by highest count
jarrid_full_stack_php.groupby(['program_id', 'path', 'primary_topic', 'subtopic', 'tertiary' ]).agg(['count']).reset_index().sort_values(by=[('user_id', 'count')], ascending=False).head(25)

Unnamed: 0_level_0,program_id,path,primary_topic,subtopic,tertiary,date,time,user_id,cohort_id,ip,name,start_date,end_date,created_at
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,count,count,count,count,count,count,count,count,count
0,1.0,/,,,,1681,1681,1681,1681,1681,1681,1681,1681,1681
506,1.0,index.html,index.html,,,1011,1011,1011,1011,1011,1011,1011,1011,1011
544,1.0,javascript-i,javascript-i,,,736,736,736,736,736,736,736,736,736
483,1.0,html-css,html-css,,,542,542,542,542,542,542,542,542,542
676,1.0,spring,spring,,,501,501,501,501,501,501,501,501,501
533,1.0,java-iii,java-iii,,,479,479,479,479,479,479,479,479,479
520,1.0,java-ii,java-ii,,,454,454,454,454,454,454,454,454,454
512,1.0,java-i,java-i,,,444,444,444,444,444,444,444,444,444
567,1.0,javascript-ii,javascript-ii,,,429,429,429,429,429,429,429,429,429
64,1.0,appendix,appendix,,,409,409,409,409,409,409,409,409,409


In [51]:
# Dataframe of Full Stack Java program graduates page visits sorted by highest count
jarrid_full_stack_java.groupby(['program_id', 'path', 'primary_topic', 'subtopic', 'tertiary' ]).agg(['count']).reset_index().sort_values(by=[('user_id', 'count')], ascending=False).head(25)

Unnamed: 0_level_0,program_id,path,primary_topic,subtopic,tertiary,date,time,user_id,cohort_id,ip,name,start_date,end_date,created_at
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,count,count,count,count,count,count,count,count,count
1,2.0,/,,,,12406,12406,12406,12406,12406,12406,12406,12406,12406
1333,2.0,javascript-i,javascript-i,,,4229,4229,4229,4229,4229,4229,4229,4229,4229
1543,2.0,spring,spring,,,3760,3760,3760,3760,3760,3760,3760,3760,3760
1509,2.0,search/search_index.json,search,search_index.json,,3562,3562,3562,3562,3562,3562,3562,3562,3562
1264,2.0,html-css,html-css,,,3136,3136,3136,3136,3136,3136,3136,3136,3136
1319,2.0,java-iii,java-iii,,,3058,3058,3058,3058,3058,3058,3058,3058,3058
1307,2.0,java-ii,java-ii,,,2985,2985,2985,2985,2985,2985,2985,2985,2985
1300,2.0,java-i,java-i,,,2679,2679,2679,2679,2679,2679,2679,2679,2679
702,2.0,appendix,appendix,,,2662,2662,2662,2662,2662,2662,2662,2662,2662
1362,2.0,javascript-ii,javascript-ii,,,2549,2549,2549,2549,2549,2549,2549,2549,2549


In [52]:
# Dataframe of Front End program graduates page visits sorted by highest count
jarrid_front_end.groupby(['program_id', 'path', 'primary_topic', 'subtopic', 'tertiary' ]).agg(['count']).reset_index().sort_values(by=[('user_id', 'count')], ascending=False).head(25)

Unnamed: 0_level_0,program_id,path,primary_topic,subtopic,tertiary,date,time,user_id,cohort_id,ip,name,start_date,end_date,created_at
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,count,count,count,count,count,count,count,count,count
1,4.0,content/html-css,content,html-css,,2,2,2,2,2,2,2,2,2
0,4.0,/,,,,1,1,1,1,1,1,1,1,1
2,4.0,content/html-css/gitbook/images/favicon.ico,content,html-css,gitbook,1,1,1,1,1,1,1,1,1
3,4.0,content/html-css/introduction.html,content,html-css,introduction.html,1,1,1,1,1,1,1,1,1


In [53]:
# Dataframe of Data Science program graduates page visits sorted by highest count
jarrid_data_science.groupby(['program_id', 'path', 'primary_topic', 'subtopic', 'tertiary' ]).agg(['count']).reset_index().sort_values(by=[('user_id', 'count')], ascending=False).head(25)

Unnamed: 0_level_0,program_id,path,primary_topic,subtopic,tertiary,date,time,user_id,cohort_id,ip,name,start_date,end_date,created_at
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,count,count,count,count,count,count,count,count,count
0,3.0,/,,,,1436,1436,1436,1436,1436,1436,1436,1436,1436
371,3.0,search/search_index.json,search,search_index.json,,493,493,493,493,493,493,493,493,493
388,3.0,sql/mysql-overview,sql,mysql-overview,,275,275,275,275,275,275,275,275,275
252,3.0,classification/overview,classification,overview,,266,266,266,266,266,266,266,266,266
257,3.0,classification/scale_features_or_not.svg,classification,scale_features_or_not.svg,,219,219,219,219,219,219,219,219,219
194,3.0,anomaly-detection/AnomalyDetectionCartoon.jpeg,anomaly-detection,AnomalyDetectionCartoon.jpeg,,193,193,193,193,193,193,193,193,193
199,3.0,anomaly-detection/overview,anomaly-detection,overview,,191,191,191,191,191,191,191,191,191
284,3.0,fundamentals/AI-ML-DL-timeline.jpg,fundamentals,AI-ML-DL-timeline.jpg,,189,189,189,189,189,189,189,189,189
305,3.0,fundamentals/modern-data-scientist.jpg,fundamentals,modern-data-scientist.jpg,,187,187,187,187,187,187,187,187,187
302,3.0,fundamentals/intro-to-data-science,fundamentals,intro-to-data-science,,184,184,184,184,184,184,184,184,184


**Jarrid's Takeaways**


Top 5 lessons visited by Full Stack Php program graduates:
- index.html
- javascript-i
- html-css
- spring
- java-iii

Top 5 lessons visited by Full Stack Java program graduates:
- javascript-i
- spring
- search/search_index.json
- html-css
- java-iii

Top 5 lessons visited Front End program graduates:

- Front End progarm graduates only have 5 observations since graduating. They only visited lessons that were under the primary topic of 'content'.


Top 5 lessons visited by Data Scientisit program graduates:
- search/search_index.json
- sql/mysql-overview
- classification/overview
- classification/scale_features_or_not.svg
- anomaly-detection/AnomalyDetectionCartoon.jpeg


### Question 6: Answer
From our exploration, we've determined that:

**Top 3 lessons visited**

Web Dev Students (both Java and php):
- javascript-i
- html-css
- spring

Data Science Students:
- index (search)
- sql overview
- classification overview

-----------------------------------------------------------------------------------------------------------
## 7. Which lessons are least accessed?

In [54]:
# Full-Stack PHP: Percentage of Lessons Accessed by Cohort
php_sub_count_pct = (full_stack_php.groupby('name').subtopic.value_counts(normalize=True, ascending=True))
pd.DataFrame(php_sub_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Arches,getting-started,0.000112
Arches,mapbox-api,0.000112
Arches,promisesdfghjkjhgfs,0.000112
Arches,units,0.000112
Arches,5762c2946250b.jpg,0.000225
...,...,...
Quincy,html-css,0.061439
Quincy,mysql,0.068715
Quincy,,0.076799
Quincy,,0.122070


In [55]:
non_cohort_php_sub_count_pct = (full_stack_php.subtopic.value_counts(normalize=True, ascending=True))
pd.DataFrame(non_cohort_php_sub_count_pct)

Unnamed: 0,subtopic
loops.html,0.000033
7.1-ds-libraries-overview,0.000033
media-queries,0.000033
5-functions,0.000033
bootstrap-grid-system,0.000033
...,...
introduction,0.037842
laravel,0.038890
fundamentals,0.047204
,0.055028


In [56]:
# Full-Stack Java: Percentage of Lessons Accessed by Cohort
non_cohortjava_sub_count_pct = (full_stack_java.subtopic.value_counts(normalize=True, ascending=True))
pd.DataFrame(non_cohortjava_sub_count_pct)

Unnamed: 0,subtopic
4.8-feature-scaling,0.000001
testing-code.md,0.000001
array-lists,0.000001
type-annotations,0.000001
submit,0.000001
...,...
css-i,0.043229
fundamentals,0.046215
,0.050283
introduction,0.050820


In [57]:
# Data Science: Percentage of Lessons Accessed by Cohort
non_cohort_ds_sub_count_pct = (data_science.subtopic.value_counts(normalize=True, ascending=True))
pd.DataFrame(non_cohort_ds_sub_count_pct)

Unnamed: 0,subtopic
explore-old,0.000010
5-inspect,0.000010
bom-and-dom,0.000010
creating-custom-fields.md,0.000010
conditionals,0.000010
...,...
AI-ML-DL-timeline.jpg,0.029919
project,0.030345
1-overview,0.034039
overview,0.039609


### Question 7: Answer
From our exploration, we've determined that:

The least accessed lesson per program:
- Full-Stack Java Cohorts: '4.8-feature-scaling'
- Full-Stack PHP Cohorts:  'loops.html'
- Data Science Cohorts: '5-inspect'

Additionally, the 'conditionals' lesson also appears to be accessed very seldom by all 3 programs