In [28]:
from sqlalchemy import create_engine
import pymysql.cursors
import pandas as pd
from pyudemy import Udemy
import getpass
import requests
import time
import os

In [30]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)

In [35]:
def get_df_info(course_data):
    
    dicts_list = []
    
    for course in course_data['results']:
        row = {}
        row['course_id'] = course['id']
        row['title'] = course['title']
        row['published_time'] = course['published_time']
        row['num_subscribers'] = course['num_subscribers'] 
        row['price'] = course['price']
        row['discount'] = course['discount']
        row['discount_price'] = course['discount_price']
        
        row['avg_rating'] = course['avg_rating']
        row['avg_recent_rating'] = course['avg_rating_recent']
        row['num_reviews'] = course['num_reviews']  
        row['num_lectures'] = course['num_lectures']
        row['num_quizzes'] = course['num_quizzes']
        row['is_practice_test_course'] = course['is_practice_test_course']
        
        row['language'] = course['locale']['title']
        row['content_length'] = course['estimated_content_length']
        
        row['primary_category'] = course['primary_category']['title']
        row['primary_subcategory'] = course['primary_subcategory']['title']
        
        row['level'] = course['instructional_level_simple']
        row['instructor'] = course['visible_instructors'][0]['display_name']
        row['instructor_job_title'] = course['visible_instructors'][0]['job_title']
        row['headline'] = course['headline']
        row['description'] = course['description']
        
        row['url'] = course['url']
        
        dicts_list.append(row)
        
    return pd.DataFrame(dicts_list)
    

In [36]:
def get_courses_info(first_page,last_page):
    
    df = pd.DataFrame(columns=['course_id', 'title', 'published_time', 'num_subscribers', 'price',
       'discount', 'discount_price', 'avg_rating', 'avg_recent_rating',
       'num_reviews', 'num_lectures', 'num_quizzes', 'is_practice_test_course',
       'language', 'content_length', 'primary_category', 'primary_subcategory',
       'level', 'instructor', 'instructor_job_title', 'headline',
       'description', 'url'])
    
    for i in range(first_page, last_page+1):
        url = f'https://www.udemy.com/api-2.0/courses/?page={i}&page_size=100&fields[course]=id,title,published_time,'\
        'num_subscribers,price,discount,discount_price,avg_rating,avg_rating_recent,num_reviews,num_lectures,num_quizzes,'\
        'is_practice_test_course,locale,estimated_content_length,primary_category,primary_subcategory,'\
        'instructional_level_simple,visible_instructors,headline,description,url'
        course = requests.get(url,auth=auth).json()
        df_page = get_df_info(course)
        df = pd.concat([df_page, df])
        time.sleep(2)
    
    return df

In [37]:
def get_courses_cat(first_page,last_page):
    
    df = pd.DataFrame(columns=['course_id', 'title', 'published_time', 'num_subscribers', 'price',
       'discount', 'discount_price', 'avg_rating', 'avg_recent_rating',
       'num_reviews', 'num_lectures', 'num_quizzes', 'is_practice_test_course',
       'language', 'content_length', 'primary_category', 'primary_subcategory',
       'level', 'instructor', 'instructor_job_title', 'headline',
       'description', 'url'])
    
    # Dropped 'Marketing' as a category 
    
    categories = ['Business', 'Design', 'Development', 'Finance+%26+Accounting', 'Health+%26+Fitness', 'IT+%26+Software', 
                  'Lifestyle','Music', 'Office+Productivity','Personal+Development', 'Photography+%26+Video', 
                  'Teaching+%26+Academics']
    
    for cat in categories:
        for i in range(first_page, last_page+1):
            url = f'https://www.udemy.com/api-2.0/courses/?category={cat}&page={i}&page_size=100&fields[course]=id,title,published_time,'\
            'num_subscribers,price,discount,discount_price,avg_rating,avg_rating_recent,num_reviews,num_lectures,num_quizzes,'\
            'is_practice_test_course,locale,estimated_content_length,primary_category,primary_subcategory,'\
            'instructional_level_simple,visible_instructors,headline,description,url'
            course = requests.get(url,auth=auth).json()
            df_page = get_df_info(course)
            df = pd.concat([df_page, df])
            time.sleep(2)
            print(cat)
    
    return df

In [38]:
def get_courses_cat1(first_page,last_page):
    
    pw = os.getenv('mysql')
    connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
    engine = create_engine(connection_string)
    
    df = pd.DataFrame(columns=['course_id', 'title', 'published_time', 'num_subscribers', 'price',
       'discount', 'discount_price', 'avg_rating', 'avg_recent_rating',
       'num_reviews', 'num_lectures', 'num_quizzes', 'is_practice_test_course',
       'language', 'content_length', 'primary_category', 'primary_subcategory',
       'level', 'instructor', 'instructor_job_title', 'headline',
       'description', 'url'])
    
    # Dropped 'Marketing', 'Photography and Video' as categories 
    
    categories = ['Business', 'Design', 'Development', 'Finance+%26+Accounting', 'Health+%26+Fitness', 'IT+%26+Software', 
                  'Lifestyle','Music', 'Office+Productivity','Personal+Development','Teaching and Academics']
    
    
    
    for cat in categories:
        for i in range(first_page, last_page+1):
            url = f'https://www.udemy.com/api-2.0/courses/?category={cat}&page={i}&page_size=100&fields[course]=id,title,published_time,'\
            'num_subscribers,price,discount,discount_price,avg_rating,avg_rating_recent,num_reviews,num_lectures,num_quizzes,'\
            'is_practice_test_course,locale,estimated_content_length,primary_category,primary_subcategory,'\
            'instructional_level_simple,visible_instructors,headline,description,url'
            course = requests.get(url,auth=auth).json()
            df_page = get_df_info(course)
            df = pd.concat([df_page, df])
            time.sleep(2)
            print(cat)
        
        df.to_sql("df_{}".format(cat), connection_string, schema='udemy', if_exists='replace', index = False)
    
    return df

In [113]:
def get_df_reviews(review_data):
    
    dicts_list = []
    
    for review in review_data['results']:
        row = {}
        row['review_id'] = review['id']
        row['rating'] = review['rating']
        row['comment'] = review['content']
        row['created_time'] = review['created'] 
        row['user'] = review['user']['display_name']
        
        dicts_list.append(row)
        
    return pd.DataFrame(dicts_list)

In [114]:
def get_reviews(id_list,first_page,last_page):
    
    df = pd.DataFrame(columns=['review_id', 'rating', 'comment', 'created_time', 'user', 'course_id'])
    
    for course_id in id_list:
        for i in range(first_page, last_page+1):
            url = f'https://www.udemy.com/api-2.0/courses/{course_id}/reviews/?page={i}&page_size=100'
            course = requests.get(url,auth=auth).json()
            df_page = get_df_reviews(course)
            df_page['course_id'] = course_id
            df = pd.concat([df_page, df])
            time.sleep(2)
        print(course_id)
    
    return df

In [202]:
def get_df_curriculum(curriculum):
    
    dicts_list = []
    row = {}
    n = 1
    for session in curriculum['results']:
        row[f'type_session_{n}'] = session['_class']
        row[f'title_session_{n}'] = session['title']
        
        n = n + 1
        
    dicts_list.append(row)
        
    return pd.DataFrame(dicts_list)

In [203]:
def get_curricula(id_list,first_page,last_page):
    
    df = pd.DataFrame(columns=['type_session_1', 'title_session_1', 'type_session_2', 'title_session_2',
                               'type_session_3', 'title_session_3', 'type_session_4', 'title_session_4'])
    
    for course_id in id_list:
        for i in range(first_page, last_page+1):
            url = f'https://www.udemy.com/api-2.0/courses/{course_id}/public-curriculum-items/?page=1&page_size=4'
            course = requests.get(url,auth=auth).json()
            df_page = get_df_curriculum(course)
            df_page['course_id'] = course_id
            df = pd.concat([df_page, df])
            time.sleep(2)
        print(course_id)
        
    df['course_id'] = df['course_id'].astype(int)
    
    return df

In [9]:
Client_ID = getpass.getpass()

········


In [10]:
Client_Secret = getpass.getpass()

········


In [14]:
udemy = Udemy('zUBi3x7r46IBTwt5H5PPu8NRvxgQvy8F0ep2if3b', 'BCezCskbH1EBkOE8kVlo1QyaMd8eyFONKvS0sBpZrRibwsujmVvxxxXkXGpPni2f0d8ikbpKdNOTaOl5KMBjIwpoWARZPq0GrtKrj85zxOVpbDHwuvOqSpSbsaMvqlXn')

In [54]:
auth = ('zUBi3x7r46IBTwt5H5PPu8NRvxgQvy8F0ep2if3b', 'BCezCskbH1EBkOE8kVlo1QyaMd8eyFONKvS0sBpZrRibwsujmVvxxxXkXGpPni2f0d8ikbpKdNOTaOl5KMBjIwpoWARZPq0GrtKrj85zxOVpbDHwuvOqSpSbsaMvqlXn')

In [None]:
# Getting the a1 and a2 batches

In [None]:
# Batch a1

In [95]:
df_courses_a1 = get_courses(1,50)

In [96]:
df_courses_a1

Unnamed: 0,course_id,title,url,is_paid,price,is_practice_test_course,headline,instructor,instructor_job_title
0,2926876,Guitar Lessons for Beginners (In Hindi),/course/playandsing/,False,Free,False,World needs more Musicians.,Vishal Diwan,Nikon School Mentor - India
1,4830898,JavaScript - Intermediate & Advanced (2022),/course/sharecodecamp-javascript/,False,Free,False,Understanding Behind The Scene!...,Enes Karakaş,Developer & @shareCodeCamp
2,3809298,Problem solving techniques,/course/problem-solving-techniques-n/,False,Free,False,A comprehensive guide to systematic thinking f...,Mahmoud Elhalabi,Doctorate of business adminstration - Training...
3,4810282,Learn how to Create Beats in Fl Studio,/course/learn-how-to-create-beats-in-fl-studio/,False,Free,False,Start your journey of Making Music Live,Divine YJ Truth,I am a Producer
4,3538116,Start Money Making WordPress Blog Today!,/course/create-a-website-blog-for-affiliate-ma...,False,Free,False,Start your Affiliate Marketing Website by foll...,Azharul Rafy,Content Creator & Digital Marketer
...,...,...,...,...,...,...,...,...,...
95,467294,Introduction to SDN and OpenFlow,/course/sdn-openflow-nfv-introduction/,False,Free,False,"What is SDN, OpenFlow and NFV? Is this actuall...",David Bombal,"CCIE #11023, over 15 years of network training..."
96,5081160,Working Remotely Secrets: 10X Your Focus in Ju...,/course/working-remotely-mastery/,False,Free,False,The Professional Guide To Working Remotely,Silviu Marisk - Effective Learning Lab,"Online Instructor Teaching 18 Courses and 200,..."
97,390910,Fundamentals of Programming: Understanding C#,/course/understandingc/,False,Free,False,Begin your journey into the world of programmi...,Jesse Dietrichson,Senior Content Developer at Microsoft
98,217000,Learn to Program in Javascript: Beginner to Pro,/course/programming-in-javascript/,False,Free,False,"Don't just be a good programmer, become a grea...",Raghavendra Dixit,Dizauvi Learning Solutions


In [97]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
df_courses_a1.to_sql("df_courses_a1", connection_string, schema='udemy', if_exists='replace', index = False)

5000

In [None]:
# Batch a2

In [93]:
df_courses_a2 = get_courses(51,83)

In [94]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
df_courses_a2.to_sql("df_courses_a2", connection_string, schema='udemy', if_exists='replace', index = False)

3300

In [None]:
# Getting list of course ids from batch 1 and 2

In [110]:
concat = pd.concat([df_courses_a1, df_courses_a2])

In [111]:
concat = concat.drop_duplicates()

In [112]:
concat['course_id'].duplicated().sum()

1

In [107]:
concat = concat[~(concat['course_id'] == 3693246) & (concat['price'] == '€19.99')] #drop duplicated Course ID

In [109]:
concat['course_id'].duplicated().sum()

0

In [116]:
course_id_list = list(concat['course_id']) #getting a list of course ids

In [None]:
# Getting batches A and B and C through course list

In [7]:
df_courses_a = get_courses_info(1,50) #batch A

In [10]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
df_courses_a.to_sql("df_courses_a", connection_string, schema='udemy', if_exists='replace', index = False)

5000

In [349]:
df_courses_b = get_courses_info(51,80) #batch B

In [350]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
df_courses_b.to_sql("df_courses_b", connection_string, schema='udemy', if_exists='replace', index = False)

3000

In [359]:
df_courses_c = get_courses_info(81,84) #batch C

In [360]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
df_courses_c.to_sql("df_courses_c", connection_string, schema='udemy', if_exists='replace', index = False)

380

In [None]:
# Getting data based on categories

In [None]:
# Batch D

In [39]:
df_courses_d = get_courses_cat(1,10)

Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Music
Music
Music
Music
Music
Music
Musi

In [40]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
df_courses_d.to_sql("df_courses_d", connection_string, schema='udemy', if_exists='replace', index = False)

12000

In [None]:
# Batch E (split into some and not all categories)

In [12]:
df_courses_e = get_courses_cat1(11,50)

Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business
Business


  df.to_sql("df_{}".format(cat), connection_string, schema='udemy', if_exists='replace', index = False)


Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design
Design


  df.to_sql("df_{}".format(cat), connection_string, schema='udemy', if_exists='replace', index = False)


Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development
Development


  df.to_sql("df_{}".format(cat), connection_string, schema='udemy', if_exists='replace', index = False)


Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting
Finance+%26+Accounting


  df.to_sql("df_{}".format(cat), connection_string, schema='udemy', if_exists='replace', index = False)


Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness
Health+%26+Fitness


  df.to_sql("df_{}".format(cat), connection_string, schema='udemy', if_exists='replace', index = False)


IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software
IT+%26+Software


  df.to_sql("df_{}".format(cat), connection_string, schema='udemy', if_exists='replace', index = False)


Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle
Lifestyle


  df.to_sql("df_{}".format(cat), connection_string, schema='udemy', if_exists='replace', index = False)


Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music
Music


  df.to_sql("df_{}".format(cat), connection_string, schema='udemy', if_exists='replace', index = False)


Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity
Office+Productivity


IndexError: list index out of range

In [None]:
# Get course reviews

In [139]:
df_compiled = pd.read_sql("SELECT * FROM udemy.df_compiled", engine)
medium_courses = df_compiled[df_compiled['num_subscribers'] >= 10000].sort_values(['num_subscribers'], ascending=True).head(100)
top_courses = df_compiled.sort_values(['num_subscribers'], ascending=False).head(100)
ids_medium = list(medium_courses['course_id'])
ids_top = list(top_courses['course_id'])

In [122]:
reviews_medium = get_reviews(ids_medium,1,1) # get reviews for courses with num_subscribers > 10000

2245856
4544060
84085
5307690
1474372
4751096
1400182
2613130
2476226
2377826
3084582
1335866
1116196
3726470
2126948
1090700
2843784
4230454
674982
2069451
1790584
2195846
2245896
1054970
164958
4572510
5124570
4806466
1204784
2266746
3806774
1152486
4874234
1747820
3618502
4462360
25671
3321318
2361148
1333372
4481730
465334
33215
349334
1442960
1420280
1804654
835878
3481152
1409860
309908
3185832
3181796
1362338
840958
2019474
1222364
4688270
977668
1775452
4587280
4149968
914220
2360166
763258
3756876
2214444
2548797
2405916
628786
58838
1824622
3075394
1747836
1414568
905286
542748
1610356
1107870
448830
2381903
4339776
606590
3897010
5037450
3990790
2419768
2197122
2300050
1470278
1434318
343262
4103280
2133816
2458906
1913646
1423634
4707388
353158
4256776


In [123]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
reviews_medium.to_sql("df_reviews_medium", connection_string, schema='udemy', if_exists='replace', index = False)

9190

In [146]:
ids_top_a = ids_top[0:50] #Getting first batch of reviews for most popular courses

In [147]:
reviews_top_a = get_reviews(ids_top_a,1,10)

24823
543600
1565838
950390
433798
625204
851712
1362070
473160
247190
146156
53600
2707184
552672
1137162
648826
437398
707962
133536
238934
387820
2394982
405926
1325686
937678
1462428
605006
192004
9711
917596
797156
3726582
466000
1331946
11331
399938
1415652
1189136
1731874
594360
2027098
3033186
1793828
1495788
42271
22169
3105814
3434032
780078
1708340


In [148]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
reviews_top_a.to_sql("df_reviews_top_a", connection_string, schema='udemy', if_exists='replace', index = False)

50000

In [150]:
ids_top_b = ids_top[50:100] # Getting second batch of review for most popular courses

In [151]:
reviews_top_b = get_reviews(ids_top_b,1,10)

3011572
584648
130064
2769314
173548
1703802
1351634
3663284
171838
2795746
3033182
3033110
2861796
333610
1879018
65330
673654
2310306
500632
591930
382002
15639
3559007
2410958
1704776
1643044
3617896
39115
1920686
217000
671544
311538
25584
236676
2602724
1917546
38282
751094
3694430
888716
74092
775330
2971820
366280
580342
3406388
1759114
3213441
151198
3833784


In [152]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
reviews_top_b.to_sql("df_reviews_top_b", connection_string, schema='udemy', if_exists='replace', index = False)

50000

In [16]:
udemy.course_reviews(24823)

{'count': 10000,
 'next': None,
 'previous': None,
 'results': [{'_class': 'course_review',
   'id': 151272944,
   'content': 'good',
   'rating': 5.0,
   'created': '2023-08-23T23:19:48-07:00',
   'modified': '2023-08-24T16:01:52-07:00',
   'user_modified': '2023-08-24T05:31:23-07:00',
   'user': {'_class': 'user',
    'title': 'Omkar Deshpande',
    'name': 'Omkar',
    'display_name': 'Omkar Deshpande'}},
  {'_class': 'course_review',
   'id': 151272186,
   'content': '',
   'rating': 4.0,
   'created': '2023-08-23T23:09:17-07:00',
   'modified': '2023-08-24T16:01:52-07:00',
   'user_modified': '2023-08-23T23:09:18-07:00',
   'user': {'_class': 'user',
    'title': 'Insane Flick',
    'name': 'Insane',
    'display_name': 'Insane Flick'}},
  {'_class': 'course_review',
   'id': 151268352,
   'content': '',
   'rating': 4.0,
   'created': '2023-08-23T22:09:11-07:00',
   'modified': '2023-08-24T16:01:52-07:00',
   'user_modified': '2023-08-23T22:09:17-07:00',
   'user': {'_class': 'us

In [None]:
# Getting curricula for courses with num_subscribers > 10000

In [204]:
curricula_medium = get_curricula(ids_medium,1,1)

2245856
4544060
84085
5307690
1474372
4751096
1400182
2613130
2476226
2377826
3084582
1335866
1116196
3726470
2126948
1090700
2843784
4230454
674982
2069451
1790584
2195846
2245896
1054970
164958
4572510
5124570
4806466
1204784
2266746
3806774
1152486
4874234
1747820
3618502
4462360
25671
3321318
2361148
1333372
4481730
465334
33215
349334
1442960
1420280
1804654
835878
3481152
1409860
309908
3185832
3181796
1362338
840958
2019474
1222364
4688270
977668
1775452
4587280
4149968
914220
2360166
763258
3756876
2214444
2548797
2405916
628786
58838
1824622
3075394
1747836
1414568
905286
542748
1610356
1107870
448830
2381903
4339776
606590
3897010
5037450
3990790
2419768
2197122
2300050
1470278
1434318
343262
4103280
2133816
2458906
1913646
1423634
4707388
353158
4256776


In [206]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
curricula_medium.to_sql("df_curricula_medium", connection_string, schema='udemy', if_exists='replace', index = False)

100

In [207]:
curricula_top = get_curricula(ids_top,1,1) # Getting curricula for most popular courses

24823
543600
1565838
950390
433798
625204
851712
1362070
473160
247190
146156
53600
2707184
552672
1137162
648826
437398
707962
133536
238934
387820
2394982
405926
1325686
937678
1462428
605006
192004
9711
917596
797156
3726582
466000
1331946
11331
399938
1415652
1189136
1731874
594360
2027098
3033186
1793828
1495788
42271
22169
3105814
3434032
780078
1708340
3011572
584648
130064
2769314
173548
1703802
1351634
3663284
171838
2795746
3033182
3033110
2861796
333610
1879018
65330
673654
2310306
500632
591930
382002
15639
3559007
2410958
1704776
1643044
3617896
39115
1920686
217000
671544
311538
25584
236676
2602724
1917546
38282
751094
3694430
888716
74092
775330
2971820
366280
580342
3406388
1759114
3213441
151198
3833784


In [208]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
curricula_top.to_sql("df_curricula_top", connection_string, schema='udemy', if_exists='replace', index = False)

100

In [None]:
# Select and concatenate relevant tables for storage in udemy_final

In [209]:
compiled = pd.read_sql("SELECT * FROM udemy.df_compiled", engine)

top_a = pd.read_sql("SELECT * FROM udemy.df_reviews_top_a", engine)

top_b = pd.read_sql("SELECT * FROM udemy.df_reviews_top_b", engine)

medium = pd.read_sql("SELECT * FROM udemy.df_reviews_medium", engine)

curricula_t = pd.read_sql("SELECT * FROM udemy.df_curricula_top", engine)

curricula_m = pd.read_sql("SELECT * FROM udemy.df_curricula_medium", engine)

In [210]:
reviews_top = pd.concat([top_a,top_b])

In [213]:
pw = os.getenv('mysql')
connection_string = 'mysql+pymysql://root:' + pw + '@localhost:3306/'
engine = create_engine(connection_string)
compiled.to_sql("course_list", connection_string, schema='udemy_final', if_exists='replace', index = False)

reviews_top.to_sql("reviews_group1", connection_string, schema='udemy_final', if_exists='replace', index = False)

medium.to_sql("reviews_group2", connection_string, schema='udemy_final', if_exists='replace', index = False)

curricula_t.to_sql("curricula_group1", connection_string, schema='udemy_final', if_exists='replace', index = False)

curricula_m.to_sql("curricula_group2", connection_string, schema='udemy_final', if_exists='replace', index = False)

100