# In [None]:  (Jupyter cell marker left over from the notebook export)
#importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the Coursera course catalogue from the Kaggle input directory.
DATASET_PATH = '/kaggle/input/coursera-data/coursera_course_data.csv'
dataset = pd.read_csv(DATASET_PATH)

# First look at the data: leading rows, per-column dtype / non-null summary,
# and the number of missing values per column.  The bare expressions only
# render something when they end a notebook cell; info() prints by itself.
dataset.head(n=10)
dataset.info()
dataset.isna().sum()
# Bar chart: how many courses carry each rating value.
plt.figure(figsize=(10, 5))
sns.countplot(data=dataset, x='Rating')
plt.xlabel('Course Ratings')
plt.ylabel('Number of courses')
# This figure counts courses per rating, so it gets its own title instead of
# sharing "Count of course types" with the subject plot below.
plt.title("Count of courses per rating\n")

# Bar chart: how many courses belong to each subject/domain.  The figure is
# extra wide so the subject labels on the x axis have room.
plt.figure(figsize=(27, 10))
sns.countplot(data=dataset, x='Subject')
plt.xlabel('Subjects/Domains')
plt.ylabel('Number of courses')
plt.title("Count of course types\n")
# Descriptive statistics of the 'Rating' column.
rating_column = dataset['Rating']
mean_rating = rating_column.mean()
median_rating = rating_column.median()
# mode() returns a Series because several values can tie for most frequent.
mode_rating = rating_column.mode()
std_dev_rating = rating_column.std()
variance_rating = rating_column.var()

print("Mean Rating:", mean_rating)
print("Median Rating:", median_rating)
# Show the modal value(s) as a plain list; printing the Series directly would
# append its index, name and dtype lines after the label.
print("Mode Rating:", mode_rating.tolist())
print("Standard Deviation of Rating:", std_dev_rating)
print("Variance of Rating:", variance_rating)

sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))

# Box plot of the ratings.  The flier markers are coloured red so that they
# actually match the red "Outliers" legend entry added below.
sns.boxplot(
    x=dataset['Rating'],
    flierprops={'markerfacecolor': 'red', 'markeredgecolor': 'red'},
)

# Fences at 1.5 * IQR beyond the quartiles; ratings outside them are treated
# as outliers.
Q1 = dataset['Rating'].quantile(0.25)
Q3 = dataset['Rating'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# The empty scatter draws nothing; it only serves as a legend handle for the
# red outlier markers of the box plot.
plt.scatter([], [], c='red', label='Outliers')
plt.legend()
plt.show()

# Keep only the rows whose rating lies within the fences (both ends
# inclusive).  Rows with a missing rating do not satisfy the condition and
# are dropped as well.
dataset = dataset[dataset['Rating'].between(lower_bound, upper_bound)]



# Summary of the data that is left after the outlier filter.  info() prints
# on its own; the bare expressions only render at the end of a notebook cell.
dataset.info()
dataset['Organisation'].value_counts()
dataset['Organisation'].nunique()
dataset['Rating'].value_counts()

# Rating distribution of the remaining courses.
plt.figure(figsize=(10, 5))
sns.countplot(data=dataset, x='Rating')
plt.xlabel('Course Ratings')
plt.ylabel('Number of courses')
# The bars count courses per rating value, not per course type.
plt.title("Count of courses per rating\n")


# Number of courses per difficulty level.
level_counts = dataset['Level'].value_counts()
print(level_counts)

# Pie chart with one wedge per level, labelled with the level name.
plt.figure(figsize=(8, 8))
plt.pie(
    x=level_counts.values,
    labels=level_counts.index.tolist(),
    startangle=0,
)
plt.title('Distribution of Course Levels')
plt.show()
# Rating on the x axis against the number of reviews on the y axis,
# drawn on the current axes.
ax = plt.gca()
ax.scatter(x=dataset['Rating'], y=dataset['Reviews_count'])
ax.set_xlabel('Rating')
ax.set_ylabel('Reviews Count')
ax.set_title('Scatter Plot between Ratings and Reviews Count')
plt.show()
# Organisations with the most courses.  Twenty are taken so that the
# selection agrees with the variable name and with the "(Top 20)" plot title.
top_20_organisations = dataset['Organisation'].value_counts().head(20).index
filtered_dataset = dataset[dataset['Organisation'].isin(top_20_organisations)]

# One row of points per organisation, positioned by course rating.
plt.scatter(filtered_dataset['Rating'], filtered_dataset['Organisation'])
plt.xlabel('Rating')
plt.ylabel('Organisation')
plt.title('Scatter Plot between Ratings and Organisation (Top 20)')
plt.show()
dataset.head()

# Discard every row that has at least one missing value, then renumber the
# remaining rows 0..n-1 so that index labels and row positions coincide.
dataset = dataset.dropna(axis=0, how='any').reset_index(drop=True)

# Check the per-column missing-value counts after the cleanup.
dataset.isna().sum()
# Columns used by the recommender.  An explicit copy is taken so that the
# column assignments below modify `data` itself instead of a slice of
# `dataset` (which would trigger pandas' SettingWithCopyWarning).
data = dataset[['Course','Subject','Organisation','Rating','Level','Description','Skills']].copy()
data.info()

# Turn the description into a comma-separated token string: spaces become
# commas, doubled commas are merged, and a few punctuation characters are
# removed.  regex=False makes every replacement literal, so '(' and ')' are
# never interpreted as regular-expression syntax regardless of the pandas
# version's default.
description = data['Description'].str.replace(' ', ',', regex=False)
description = description.str.replace(',,', ',', regex=False)
for unwanted_char in ('_', ':', '(', ')'):
    description = description.str.replace(unwanted_char, '', regex=False)
data['Description'] = description

# Removing parentheses from skills columns
skills = data['Skills']
for unwanted_char in ('(', ')'):
    skills = skills.str.replace(unwanted_char, '', regex=False)
data['Skills'] = skills

data.head()
# Concatenate information from multiple columns into two comma-separated
# text fields: 'tags' (organisation, level, description) and 'tags2'
# (course title, skills, subject).
data['tags'] = data['Organisation'] + ',' + data['Level'] + ',' + data['Description']
data['tags2'] = data['Course'] + ',' + data['Skills'] + ',' + data['Subject']
data.head()

# Frame used by the recommender.  It is an explicit copy so the assignments
# below do not write into a slice of `data`.
new_df = data[['Course','Subject','Organisation','tags','tags2','Rating']].copy()

# Lower-case the tag text and turn the comma separators into spaces.  Both
# steps operate on new_df's own columns, so the lower-casing is kept rather
# than being overwritten by the original-case text from `data`.
new_df['tags'] = new_df['tags'].str.lower().str.replace(',', ' ', regex=False)
new_df['tags2'] = new_df['tags2'].str.lower().str.replace(',', ' ', regex=False)
new_df.head()

# Course titles keep their case; only their commas are replaced by spaces.
new_df['Course'] = new_df['Course'].str.replace(',', ' ', regex=False)
new_df.head()
new_df.isnull().sum()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk 
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem(text):
    """Return *text* with every whitespace-separated word Porter-stemmed.

    The words are split on whitespace, stemmed one by one with the
    module-level ``ps`` stemmer and joined back with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())


# Stem the tag text BEFORE building the count vectors; vectorising first
# would leave the stemmed text unused by the similarity computation.
new_df['tags'] = new_df['tags'].apply(stem)
new_df['tags2'] = new_df['tags2'].apply(stem)

# Bag-of-words counts limited to the 5000 most frequent terms, with English
# stop words removed.  The same vectorizer object is refitted for each
# column, so after these lines `cv` holds the vocabulary of 'tags2' only.
cv = CountVectorizer(max_features=5000,stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()
vectors2 = cv.fit_transform(new_df['tags2']).toarray()

# Pairwise course-to-course cosine similarity for each tag set.
similarity1 = cosine_similarity(vectors)
similarity2 = cosine_similarity(vectors2)
# Weighted blend: 'tags2' (title, skills, subject) counts four times as much
# as 'tags' (organisation, level, description).
similarity = 0.2 * similarity1 + 0.8 * similarity2
def recommend(course, courses=None, similarity_matrix=None):
    """Print course recommendations for the course titled *course*.

    Three groups are printed, each entry as ``Recommended Course: <title>``
    followed by blank lines:

    1. the six courses with the highest similarity score to *course*,
    2. the three best-rated other courses of the same organisation,
    3. the three best-rated other courses of the same subject.

    The query course itself is never recommended.

    Args:
        course: Exact title to look up in the ``'Course'`` column.
        courses: DataFrame with ``'Course'``, ``'Subject'``,
            ``'Organisation'`` and ``'Rating'`` columns.  Defaults to the
            module-level ``new_df``.
        similarity_matrix: Square array whose row/column ``i`` corresponds to
            row position ``i`` of *courses*.  Defaults to the module-level
            ``similarity``.

    Returns:
        None.  If no course with that title exists, a "Course not found"
        message is printed instead of recommendations.
    """
    if courses is None:
        courses = new_df
    if similarity_matrix is None:
        similarity_matrix = similarity

    # Look the title up by row POSITION, because both the similarity matrix
    # and .iloc are positional; this stays correct even if the frame's index
    # labels are not 0..n-1.
    matches = np.flatnonzero((courses['Course'] == course).to_numpy())
    if matches.size == 0:
        print(f"Course not found: {course}")
        return
    course_index = int(matches[0])

    #similarity scores
    distances = similarity_matrix[course_index]

    # Rank all courses by similarity (highest first) and skip the query
    # course explicitly rather than assuming it is ranked first.
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    similar_positions = [pos for pos, _ in ranked if pos != course_index][:6]

    for pos in similar_positions:
        print(f"Recommended Course: {courses.iloc[pos]['Course']}")
        print("\n")

    # Candidates for the rating-based groups: every row except the query.
    query_row = courses.iloc[course_index]
    others = courses[np.arange(len(courses)) != course_index]

    same_sub_courses = others[others['Subject'] == query_row['Subject']].nlargest(3, 'Rating')
    same_org_courses = others[others['Organisation'] == query_row['Organisation']].nlargest(3, 'Rating')

    for _, row in same_org_courses.iterrows():
        print(f"Recommended Course: {row['Course']}")
        print("\n")
    for _, row in same_sub_courses.iterrows():
        print(f"Recommended Course: {row['Course']}")
        print("\n")
# Example usage: print recommendations for a handful of course titles,
# one after the other.
for example_title in (
    'IBM Data Science',
    'Entrepreneurship: Growing Your Business',
    'The Business of Health Care',
    'Supervised Machine Learning: Regression and Classification',
    'Deep Learning with PyTorch : Image Segmentation',
    'Writing and Editing: Structure and Organization',
    'Neural Networks and Deep Learning',
):
    recommend(example_title)