# Answers and Analysis/Findings for Thursday Board Meeting

In [1]:
# Importing necessary tools and files
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics


import joint_acquire
import joint_prepare
import env


In [2]:
# Utilizing acquire.py to assign dataset to `df`
df = joint_acquire.acquire_logs(user=env.user, password=env.password, host=env.host)
df

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900218,2021-04-21,16:41:51,jquery/personal-site,64,28.0,71.150.217.33,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0
900219,2021-04-21,16:42:02,jquery/mapbox-api,64,28.0,71.150.217.33,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0
900220,2021-04-21,16:42:09,jquery/ajax/weather-map,64,28.0,71.150.217.33,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0
900221,2021-04-21,16:44:37,anomaly-detection/discrete-probabilistic-methods,744,28.0,24.160.137.86,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2.0


In [3]:
# Utilizing prepare.py to clean and prep dataset;
# Separated data with lack of context into `df_admin`
df, df_admin = joint_prepare.prepare_logs(df)

In [4]:
# Expanding view of dataframes and information in this notebook
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

-----------------------------------------------------------------------------------------------------------
## 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [5]:
# Dataframe Variable Overview
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 847329 entries, 0 to 900222
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   date           847329 non-null  object 
 1   time           847329 non-null  object 
 2   path           847329 non-null  object 
 3   user_id        847329 non-null  int64  
 4   cohort_id      847329 non-null  float64
 5   ip             847329 non-null  object 
 6   name           847329 non-null  object 
 7   start_date     847329 non-null  object 
 8   end_date       847329 non-null  object 
 9   created_at     847329 non-null  object 
 10  program_id     847329 non-null  float64
 11  primary_topic  847329 non-null  object 
 12  subtopic       847329 non-null  object 
 13  tertiary       847329 non-null  object 
dtypes: float64(2), int64(1), object(11)
memory usage: 129.2+ MB


In [6]:
# Dataframe Overview
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,start_date,end_date,created_at,program_id,primary_topic,subtopic,tertiary
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,1.0,,,
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,1.0,java-ii,,
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,1.0,java-ii,object-oriented-programming,
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,1.0,slides,object_oriented_programming,
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,Teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2.0,javascript-i,conditionals,


In [7]:
# Splitting dataframe by program into 4 separate dfs
full_stack_php = df[df.program_id == 1]
full_stack_java = df[df.program_id == 2]
data_science = df[df.program_id == 3]
front_end = df[df.program_id == 4]

### Full-Stack PHP Program

In [8]:
# Full-Stack PHP: Count of Modules Accessed by Cohort
php_primary_count = (full_stack_php.groupby('name').primary_topic.value_counts())

In [9]:
# Full-Stack PHP: Percentage of Modules Accessed by Cohort
php_primary_count_pct = (full_stack_php.groupby('name').primary_topic.value_counts(normalize=True))

In [10]:
# Dataframe of Count of Modules Accessed by Cohort
pd.DataFrame(php_primary_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_topic
name,primary_topic,Unnamed: 2_level_1
Arches,javascript-i,1333
Arches,html-css,1031
Arches,spring,914
Arches,mysql,775
Arches,java-iii,696
Arches,javascript-ii,672
Arches,jquery,650
Arches,,626
Arches,java-ii,500
Arches,java-i,422


In [11]:
# Dataframe of Percentage of Modules Accessed by Cohort
pd.DataFrame(php_primary_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_topic
name,primary_topic,Unnamed: 2_level_1
Arches,javascript-i,0.149944
Arches,html-css,0.115973
Arches,spring,0.102812
Arches,mysql,0.087177
Arches,java-iii,0.07829
Arches,javascript-ii,0.075591
Arches,jquery,0.073116
Arches,,0.070416
Arches,java-ii,0.056243
Arches,java-i,0.047469


In [12]:
# Full-Stack PHP: Count of Lessons Accessed by Cohort
php_sub_count = (full_stack_php.groupby('name').subtopic.value_counts())

In [13]:
# Full-Stack PHP: Percentage of Lessons Accessed by Cohort
php_sub_count_pct = (full_stack_php.groupby('name').subtopic.value_counts(normalize=True))

In [14]:
# Dataframe of Count of Lessons Accessed by Cohort
pd.DataFrame(php_sub_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Arches,,1803
Arches,,626
Arches,fundamentals,574
Arches,introduction,362
Arches,css-ii,320
Arches,css-i,274
Arches,arrays,200
Arches,bom-and-dom,195
Arches,relationships,133
Arches,ajax,120


In [15]:
# Dataframe of Percentage of Lessons Accessed by Cohort
pd.DataFrame(php_sub_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Arches,,0.202812
Arches,,0.070416
Arches,fundamentals,0.064567
Arches,introduction,0.04072
Arches,css-ii,0.035996
Arches,css-i,0.030821
Arches,arrays,0.022497
Arches,bom-and-dom,0.021935
Arches,relationships,0.014961
Arches,ajax,0.013498


### Full-Stack Java Program

In [16]:
# Full-Stack Java: Count of Modules Accessed by Cohort
java_primary_count = (full_stack_java.groupby('name').primary_topic.value_counts())

In [17]:
# Full-Stack Java: Percentage of Modules Accessed by Cohort
java_primary_count_pct = (full_stack_java.groupby('name').primary_topic.value_counts(normalize=True))

In [18]:
# Dataframe of Count of Modules Accessed by Cohort
pd.DataFrame(java_primary_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_topic
name,primary_topic,Unnamed: 2_level_1
Andromeda,javascript-i,4077
Andromeda,mysql,3029
Andromeda,html-css,2865
Andromeda,spring,2850
Andromeda,java-iii,2117
Andromeda,jquery,2101
Andromeda,java-ii,1884
Andromeda,javascript-ii,1340
Andromeda,java-i,1330
Andromeda,,1174


In [19]:
# Dataframe of Percentage of Modules Accessed by Cohort
pd.DataFrame(java_primary_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_topic
name,primary_topic,Unnamed: 2_level_1
Andromeda,javascript-i,0.160771
Andromeda,mysql,0.119445
Andromeda,html-css,0.112978
Andromeda,spring,0.112386
Andromeda,java-iii,0.083481
Andromeda,jquery,0.08285
Andromeda,java-ii,0.074293
Andromeda,javascript-ii,0.052841
Andromeda,java-i,0.052447
Andromeda,,0.046295


In [20]:
# Full-Stack Java: Count of Lessons Accessed by Cohort
# Full-Stack Java: Percentage of Lessons Accessed by Cohort
java_sub_count = (full_stack_java.groupby('name').subtopic.value_counts())
java_sub_count_pct = (full_stack_java.groupby('name').subtopic.value_counts(normalize=True))

In [21]:
# Dataframe of Count of Lessons Accessed by Cohort
pd.DataFrame(java_sub_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Andromeda,,4381
Andromeda,fundamentals,1757
Andromeda,introduction,1387
Andromeda,,1176
Andromeda,css-i,982
Andromeda,css-ii,916
Andromeda,arrays,570
Andromeda,bom-and-dom,449
Andromeda,relationships,440
Andromeda,events,422


In [22]:
# Dataframe of Percentage of Lessons Accessed by Cohort
pd.DataFrame(java_sub_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Andromeda,,0.172759
Andromeda,fundamentals,0.069285
Andromeda,introduction,0.054695
Andromeda,,0.046374
Andromeda,css-i,0.038724
Andromeda,css-ii,0.036121
Andromeda,arrays,0.022477
Andromeda,bom-and-dom,0.017706
Andromeda,relationships,0.017351
Andromeda,events,0.016641


### Data Science Program

In [23]:
# Data Science: Count of Lessons Accessed by Cohort
ds_sub_count = (data_science.groupby('name').subtopic.value_counts())

In [24]:
# Data Science: Percentage of Lessons Accessed by Cohort
ds_sub_count_pct = (data_science.groupby('name').primary_topic.value_counts(normalize=True))

In [25]:
# Dataframe of Count of Lessons Accessed by Cohort
pd.DataFrame(ds_sub_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Bayes,,1967
Bayes,1-overview,1826
Bayes,project,1024
Bayes,modern-data-scientist.jpg,674
Bayes,AI-ML-DL-timeline.jpg,672
Bayes,4-explore,652
Bayes,1.1-intro-to-data-science,640
Bayes,search_index.json,589
Bayes,cli,553
Bayes,3-prep,431


In [26]:
# Dataframe of Percentage of Lessons Accessed by Cohort
pd.DataFrame(ds_sub_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,primary_topic
name,primary_topic,Unnamed: 2_level_1
Bayes,1-fundamentals,0.117718
Bayes,6-regression,0.096955
Bayes,4-python,0.090135
Bayes,3-sql,0.087158
Bayes,,0.07412
Bayes,7-classification,0.073291
Bayes,5-stats,0.072424
Bayes,8-clustering,0.050192
Bayes,appendix,0.046914
Bayes,10-anomaly-detection,0.045218


### Question 1: Answer
From our exploration, we've determined that the lessons that attract the most traffic consistently across cohorts are the fundamentals and concept introduction pages. Regardless of program, it seems that these initial pages, and associated fundamental concepts are most visited across cohorts.

-----------------------------------------------------------------------------------------------------------
## 2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?


Analysis performed above answering the first question can also be used to answer this second question. 

**Takeaways: full stack php**
1. laravel is referred to significantly more by 'Kings' cohort than other cohorts; most of whom gloss over it.

**Takeaways: full stack java**
1. No outstanding differences within full stack java cohorts (all are rather uniform)

**Takeaways: data science**
1. Cohorts Darden and Easley referred to 'scale_features_or_not' significantly more than other cohorts

### Question 2: Answer
From our exploration, we've determined:
- Full-Stack Java Cohorts: No outstanding differences between cohorts (all are uniform)
- Full-Stack PHP Cohorts: Kings cohort refers to 'laravel' significantly more than other cohorts
- Data Science Cohorts: Darden and Easley cohorts refer to 'scale_features_or_not' significantly more than other cohorts

-----------------------------------------------------------------------------------------------------------
## 3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?

In [27]:
# Creating df that only pulls data of students who are actively in their class period
active_students = df[(df.date <= df.end_date)|(df.date >= df.start_date)]

In [28]:
# describe value counts of user_id, shows hits
active_students.user_id.value_counts().describe()

count    911.000000  
mean     930.108672  
std      1236.014277 
min      1.000000    
25%      193.000000  
50%      697.000000  
75%      1298.500000 
max      17913.000000
Name: user_id, dtype: float64

In [29]:
# Because the user_id is used on multiple rows, set it as index so it can be explored
active_students = active_students.set_index('user_id')

In [30]:
# Simplify dataframe, drop unused columns
active_students = active_students.drop(columns=['start_date','end_date','created_at','date','time'])

In [31]:
#  Shows simplified df
active_students.head()

Unnamed: 0_level_0,path,cohort_id,ip,name,program_id,primary_topic,subtopic,tertiary
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,/,8.0,97.105.19.61,Hampton,1.0,,,
1,java-ii,8.0,97.105.19.61,Hampton,1.0,java-ii,,
1,java-ii/object-oriented-programming,8.0,97.105.19.61,Hampton,1.0,java-ii,object-oriented-programming,
1,slides/object_oriented_programming,8.0,97.105.19.61,Hampton,1.0,slides,object_oriented_programming,
2,javascript-i/conditionals,22.0,97.105.19.61,Teddy,2.0,javascript-i,conditionals,


In [45]:
# Utilizing simplified dataframe, we setup where we draw the line for limited use of active students, around 150 hits, below the lower IQR of 193
low_use_active_students = active_students[active_students.index.value_counts() <=150]
low_use_active_students.head()

Unnamed: 0_level_0,path,cohort_id,ip,name,program_id,primary_topic,subtopic,tertiary
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
13,javascript-i/functions,22.0,97.105.19.61,Teddy,2.0,javascript-i,functions,
21,index.html,16.0,71.137.146.71,Niagara,2.0,index.html,,
21,java-i,16.0,71.137.146.71,Niagara,2.0,java-i,,
24,appendix,21.0,97.105.19.61,Sequoia,2.0,appendix,,
24,appendix/capstone-workbook,21.0,97.105.19.61,Sequoia,2.0,appendix,capstone-workbook,


In [44]:
# Visualizing simplified dataframe showing low use active students by cohort 
(pd.DataFrame(active_students[active_students.index.value_counts() <=150].groupby('user_id').name.value_counts())).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,name
user_id,name,Unnamed: 2_level_1
13,Teddy,94
21,Niagara,107
24,Sequoia,26
45,Sequoia,65
46,Teddy,54
49,Sequoia,25
55,Sequoia,95
65,Sequoia,107
66,Kings,1
69,Lassen,67


### Question 3: Answer
From our exploration, we've determined that 

Can we see how often these same students utilized the curriculum after they graduated?

-----------------------------------------------------------------------------------------------------------
## 5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?

### Question 5: Answer
From our exploration, we've determined that 

-----------------------------------------------------------------------------------------------------------
## 6. What topics are grads continuing to reference after graduation and into their jobs (for each program)?

### Question 6: Answer
From our exploration, we've determined that 

-----------------------------------------------------------------------------------------------------------
## 7. Which lessons are least accessed?

In [None]:
# Full-Stack PHP: Percentage of Lessons Accessed by Cohort
php_sub_count_pct = (full_stack_php.groupby('name').subtopic.value_counts(normalize=True, ascending=True))
pd.DataFrame(php_sub_count_pct)

Unnamed: 0_level_0,Unnamed: 1_level_0,subtopic
name,subtopic,Unnamed: 2_level_1
Arches,getting-started,0.000112
Arches,mapbox-api,0.000112
Arches,promisesdfghjkjhgfs,0.000112
Arches,units,0.000112
Arches,5762c2946250b.jpg,0.000225
Arches,exceptions_and_error_handling,0.000225
Arches,file_io,0.000225
Arches,versioning,0.000225
Arches,vocab,0.000225
Arches,javascript_ii,0.000337


In [None]:
# Full-Stack Java: Percentage of Lessons Accessed by Cohort
java_sub_count_pct = (full_stack_java.groupby('name').subtopic.value_counts(normalize=True))

In [None]:
# Data Science: Percentage of Lessons Accessed by Cohort
ds_sub_count_pct = (data_science.groupby('name').primary_topic.value_counts(normalize=True))

### Question 7: Answer
From our exploration, we've determined that 