In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
# This import is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install nltk
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Path of the reviews CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/Coursera_reviews.csv'
# lineterminator='\n' makes pandas split rows on '\n' only.
# NOTE(review): presumably needed because some review text contains stray '\r' characters — confirm.
df = pd.read_csv(file_path, lineterminator='\n')

In [None]:
# Single shared VADER analyzer; the polarity helpers call it through this module-level name.
sia = SentimentIntensityAnalyzer()

In [None]:
def get_sentiment_rating(sentiment_polarity):
    """Map a polarity score onto a rating with an affine transform.

    Computes ``2 * (sentiment_polarity + 1) + 1``, so -1 maps to 1, 0 maps
    to 3 and +1 maps to 5. Values outside [-1, 1] are not clamped.
    """
    doubled_offset = 2 * (sentiment_polarity + 1)
    return doubled_offset + 1

In [None]:
def get_sentiment_polarity(review):
    """Return the VADER ``compound`` score for ``review``.

    ``review`` is passed through ``str()`` first, so non-string values are
    scored as their string form. Relies on the module-level ``sia`` analyzer.
    """
    return sia.polarity_scores(str(review))['compound']

In [None]:
def linear_scaling(sentiment_polarity):
    """Linearly rescale a polarity score: -1 maps to 0, 0 to 2.5 and +1 to 5.

    Inputs outside [-1, 1] are not clamped.
    """
    shifted = sentiment_polarity + 1
    return 2.5 * shifted


In [None]:
def piecewise_linear_scaling(sentiment_polarity):
    """Map a polarity score onto a rating using three linear segments.

    Segments (selected by ``sentiment_polarity``, written ``p`` below):
      * below -0.1:       ``2.97 + 3 * (p + 0.1)``, floored at 1
      * -0.1 up to 0.35:  ``3 + 2.25 * (p + 0.1)``
      * 0.35 and above:   ``4 + (p - 0.35) * 1.538`` (about 4.9997 at p = 1)

    The segments do not join exactly: the value steps from just under 2.97
    to 3 at p = -0.1, and from about 4.0125 down to 4 at p = 0.35. No upper
    clamp is applied, so inputs above 1 can exceed 5.
    """
    if sentiment_polarity < -0.1:
        # Negative band: the floor keeps strongly negative scores at 1.
        return max(1, 2.97 + 3 * (sentiment_polarity + 0.1))
    if sentiment_polarity < 0.35:
        # Neutral / mildly positive band.
        return 3 + 2.25 * (sentiment_polarity + 0.1)
    # Clearly positive band.
    return 4 + (sentiment_polarity - 0.35) * 1.538


In [None]:
import numpy as np

def exponential_scaling(sentiment_polarity):
    """Map a polarity score onto a rating with a tanh curve centred on 2.5.

    Computes ``2.5 + 3 * tanh(p)``. For p in [-1, 1] the result lies between
    roughly 0.215 and 4.785; a polarity of 0 gives exactly 2.5.
    """
    swing = 3 * np.tanh(sentiment_polarity)
    return 2.5 + swing


In [None]:
import numpy as np

def quantile_scale(x, quantiles):
    """Bucket ``x`` into a fixed rating by comparing it against thresholds.

    Only ``quantiles[0]`` .. ``quantiles[3]`` are consulted, in that order;
    the first threshold that ``x`` does not exceed selects the rating
    (1.5, 2.5, 3.5 or 4). A value above all four thresholds gets 5.
    Any further entries in ``quantiles`` are ignored. The thresholds are
    expected to be in ascending order.
    """
    ratings_by_threshold = (1.5, 2.5, 3.5, 4)
    for position, rating in enumerate(ratings_by_threshold):
        if x <= quantiles[position]:
            return rating
    return 5

In [None]:
def sigmoid_scaling(sentiment_polarity):
    """Squash a polarity score into the 0-5 range with a logistic curve.

    Uses a steepness factor of 10, so a polarity of 0 maps to 2.5 and a
    polarity of +/-0.5 already lands near 4.97 / 0.03.
    """
    damping = np.exp(-10 * sentiment_polarity)
    logistic = 1 / (1 + damping)
    return logistic * 5


In [None]:
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Create the VADER analyzer used by get_sentiment_polarity_vader below.
# NOTE(review): this rebinds the module-level `sia` that an earlier cell already created.
sia = SentimentIntensityAnalyzer()

def get_sentiment_polarity_vader(review):
    """Return the VADER ``compound`` score for ``review``.

    The input is converted with ``str()`` before scoring, so non-string
    values are scored as their string form. Uses the module-level ``sia``.
    """
    return sia.polarity_scores(str(review))['compound']

def get_sentiment_polarity_textblob(review):
    """Return TextBlob's ``sentiment.polarity`` for ``review``.

    The input is converted with ``str()`` before a ``TextBlob`` is built
    from it, so non-string values are scored as their string form.
    """
    return TextBlob(str(review)).sentiment.polarity

# Load the reviews CSV from the mounted Drive (rows are split on '\n' only).
file_path = '/content/drive/MyDrive/Coursera_reviews.csv'
df = pd.read_csv(file_path, lineterminator='\n')

# Write the frame back out as a new CSV, without the index column.
# NOTE(review): no sentiment columns have been added to `df` at this point in the cell,
# so this file holds the same columns that were just read — confirm that is intended.
df.to_csv('updated_file_with_rating.csv', index=False)


In [None]:
# Score every review with VADER and map the compound polarity through get_sentiment_rating.
df['vader_sentiment_polarity'] = df['reviews'].apply(get_sentiment_polarity_vader)
df['vader_sentiment_rating'] = df['vader_sentiment_polarity'].apply(get_sentiment_rating)

# Same two steps using TextBlob's polarity.
df['textblob_sentiment_polarity'] = df['reviews'].apply(get_sentiment_polarity_textblob)
df['textblob_sentiment_rating'] = df['textblob_sentiment_polarity'].apply(get_sentiment_rating)

# Persist the frame now that the rating columns exist. The only other write to
# this path happens right after the CSV is loaded, before any sentiment column
# is added, so without this the file would never contain the ratings.
df.to_csv('updated_file_with_rating.csv', index=False)

In [None]:
# Rating variant: linear_scaling applied to each engine's polarity column.
df['vader_sentiment_rating_linear'] = df['vader_sentiment_polarity'].apply(linear_scaling)
df['textblob_sentiment_rating_linear'] = df['textblob_sentiment_polarity'].apply(linear_scaling)
# Optional export of the frame at this stage (currently disabled):
# df.to_csv('updated_file_with_rating_linear.csv', index=False)

In [None]:
# Rating variant: piecewise_linear_scaling applied to each engine's polarity column.
df['vader_sentiment_rating_piecewise'] = df['vader_sentiment_polarity'].apply(piecewise_linear_scaling)
df['textblob_sentiment_rating_piecewise'] = df['textblob_sentiment_polarity'].apply(piecewise_linear_scaling)
# Optional export of the frame at this stage (currently disabled):
# df.to_csv('updated_file_with_rating_piecewise_linear.csv', index=False)

In [None]:
# Rating variant: exponential_scaling applied to each engine's polarity column.
df['vader_sentiment_rating_exp'] = df['vader_sentiment_polarity'].apply(exponential_scaling)
df['textblob_sentiment_rating_exp'] = df['textblob_sentiment_polarity'].apply(exponential_scaling)
# Export every column present in the frame at this point, without the index.
df.to_csv('updated_file_with_rating_exp.csv', index=False)
# Show the first rows (up to 20) as the cell output.
df.head(20)

Unnamed: 0,reviews,reviewers,date_reviews,rating,course_id,vader_sentiment_polarity,vader_sentiment_rating,textblob_sentiment_polarity,textblob_sentiment_rating,vader_sentiment_rating_linear,textblob_sentiment_rating_linear,vader_sentiment_rating_piecewise,textblob_sentiment_rating_piecewise,vader_sentiment_rating_exp,textblob_sentiment_rating_exp,vader_quantile_scaled_rating,textblob_quantile_scaled_rating,vader_sentiment_rating_sigmoid,textblob_sentiment_rating_sigmoid
0,"Pretty dry, but I was able to pass with just t...",By Robert S,"Feb 12, 2020",4,google-cbrs-cpi-training,0.644,4.288,0.159848,3.319697,4.11,2.899621,4.452172,3.584659,4.202851,2.975502,4.0,2.5,4.992031,4.159033
1,would be a better experience if the video and ...,By Gabriel E R,"Sep 28, 2020",4,google-cbrs-cpi-training,0.4404,3.8808,0.5,4.0,3.601,3.75,4.139035,4.2307,3.741928,3.886351,2.5,4.0,4.939597,4.966536
2,Information was perfect! The program itself wa...,By Jacob D,"Apr 08, 2020",4,google-cbrs-cpi-training,0.6572,4.3144,0.1775,3.355,4.143,2.94375,4.472474,3.624375,4.229491,3.026977,4.0,2.5,4.993015,4.275392
3,A few grammatical mistakes on test made me do ...,By Dale B,"Feb 24, 2020",4,google-cbrs-cpi-training,0.4633,3.9266,0.05,3.1,3.65825,2.625,4.174255,3.3375,3.79831,2.649875,3.5,2.5,4.951841,3.112297
4,Excellent course and the training provided was...,By Sean G,"Jun 18, 2020",4,google-cbrs-cpi-training,0.7823,4.5646,0.651111,4.302222,4.45575,4.127778,4.664877,4.463109,4.462075,4.217252,5.0,5.0,4.997999,4.992577
5,Some of the quizzes contained material not exp...,By Daniel F,"Dec 23, 2019",4,google-cbrs-cpi-training,0.0,3.0,0.0,3.0,2.5,2.5,3.225,3.225,2.5,2.5,1.5,1.5,2.5,2.5
6,Solid presentation all the way through. I real...,By Logan D,"Sep 03, 2020",5,google-cbrs-cpi-training,0.8264,4.6528,0.135714,3.271429,4.566,2.839286,4.732703,3.530357,4.535615,2.904662,5.0,2.5,4.998712,3.976475
7,Probably the best certification course I've ta...,By Luis M C,"Nov 21, 2019",5,google-cbrs-cpi-training,0.8935,4.787,0.507143,4.014286,4.73375,3.767857,4.835903,4.241686,4.639354,3.903148,5.0,5.0,4.999342,4.968828
8,The ProctorU.com system took 2 times the amoun...,By scott w,"Sep 28, 2020",5,google-cbrs-cpi-training,-0.4767,2.0466,-0.133333,2.733333,1.30825,2.166667,1.8399,2.87,1.169209,2.102354,1.5,1.5,0.042171,1.043043
9,Covered all of the required information in an ...,By Ryan H,"Aug 26, 2019",5,google-cbrs-cpi-training,0.908,4.816,0.5125,4.025,4.77,3.78125,4.858204,4.249925,4.660513,3.915672,5.0,5.0,4.99943,4.970445


In [None]:
# Thresholds are the 10th/33rd/50th/67th/83rd percentiles of each engine's own polarity column.
# NOTE(review): quantile_scale only reads the first four thresholds, so the 83rd
# percentile computed here is never used — confirm whether a sixth bucket was intended.
quantiles_v = np.percentile(df['vader_sentiment_polarity'],  [10, 33, 50, 67, 83])
quantiles_t = np.percentile(df['textblob_sentiment_polarity'],  [10, 33, 50, 67, 83])
# Bucket each polarity against the thresholds derived from the same column.
df['vader_quantile_scaled_rating'] = df['vader_sentiment_polarity'].apply(lambda x: quantile_scale(x, quantiles_v))
df['textblob_quantile_scaled_rating'] = df['textblob_sentiment_polarity'].apply(lambda x: quantile_scale(x, quantiles_t))
# Optional export of the frame at this stage (currently disabled):
# df.to_csv('updated_file_with_rating_quant.csv', index=False)

In [None]:
# Rating variant: sigmoid_scaling applied to each engine's polarity column.
df['vader_sentiment_rating_sigmoid'] = df['vader_sentiment_polarity'].apply(sigmoid_scaling)
df['textblob_sentiment_rating_sigmoid'] = df['textblob_sentiment_polarity'].apply(sigmoid_scaling)
# Optional export of the frame at this stage (currently disabled):
# df.to_csv('updated_file_with_rating_sigmoid.csv', index=False)

In [None]:
import pandas as pd

# Per-course means of every sentiment-derived rating column in `df`, plus the
# mean of the actual `rating` column. Each result is a two-column frame:
# 'course_id' and the labelled average.
def _mean_by_course(column, label):
    """Return the per-course mean of ``df[column]`` as a frame with columns ['course_id', label]."""
    means = df.groupby('course_id')[column].mean().reset_index()
    means.columns = ['course_id', label]
    return means


# VADER-based ratings
average_vader_ratings = _mean_by_course(
    'vader_sentiment_rating', 'average_vader_sentiment_rating')
average_vader_ratings_linear = _mean_by_course(
    'vader_sentiment_rating_linear', 'average_vader_sentiment_rating_linear')
average_vader_ratings_piecewise = _mean_by_course(
    'vader_sentiment_rating_piecewise', 'average_vader_sentiment_rating_piecewise')
average_vader_ratings_exp = _mean_by_course(
    'vader_sentiment_rating_exp', 'average_vader_sentiment_rating_exp')
average_vader_quantile_scaled_rating = _mean_by_course(
    'vader_quantile_scaled_rating', 'average_vader_quantile_scaled_rating')
average_vader_ratings_sigmoid = _mean_by_course(
    'vader_sentiment_rating_sigmoid', 'average_vader_sentiment_rating_sigmoid')

# TextBlob-based ratings
average_textblob_ratings = _mean_by_course(
    'textblob_sentiment_rating', 'average_textblob_sentiment_rating')
average_textblob_ratings_linear = _mean_by_course(
    'textblob_sentiment_rating_linear', 'average_textblob_sentiment_rating_linear')
average_textblob_ratings_piecewise = _mean_by_course(
    'textblob_sentiment_rating_piecewise', 'average_textblob_sentiment_rating_piecewise')
average_textblob_ratings_exp = _mean_by_course(
    'textblob_sentiment_rating_exp', 'average_textblob_sentiment_rating_exp')
average_textblob_quantile_scaled_rating = _mean_by_course(
    'textblob_quantile_scaled_rating', 'average_textblob_quantile_scaled_rating')
average_textblob_ratings_sigmoid = _mean_by_course(
    'textblob_sentiment_rating_sigmoid', 'average_textblob_sentiment_rating_sigmoid')

# Average of the actual rating column, for comparison
actual_rating = _mean_by_course('rating', 'average_actual_rating')


import pandas as pd
from functools import reduce

# Per-course average frames to combine. They are merged left to right, so this
# order is also the column order of the combined result; actual_rating comes last.
dfs = [
    average_vader_ratings, average_vader_ratings_linear, average_vader_ratings_piecewise,
    average_vader_ratings_exp, average_vader_quantile_scaled_rating, average_vader_ratings_sigmoid,
    average_textblob_ratings, average_textblob_ratings_linear, average_textblob_ratings_piecewise,
    average_textblob_ratings_exp, average_textblob_quantile_scaled_rating, average_textblob_ratings_sigmoid,
    actual_rating
]

def merge_dataframes(dfs, on, how='outer'):
    """Fold a sequence of DataFrames into one by merging them left to right.

    Args:
        dfs: non-empty iterable of DataFrames that all contain the ``on`` key.
        on: column name (or list of names) passed to ``pd.merge`` as the join key.
        how: join type passed to ``pd.merge``; defaults to 'outer'.

    Returns:
        The merged DataFrame. A single-element input is returned as-is.
    """
    def _join(left, right):
        return pd.merge(left, right, on=on, how=how)

    return reduce(_join, dfs)

# Merge all DataFrames on 'course_id' (outer join, so a course missing from any one frame is kept).
combined_ratings = merge_dataframes(dfs, on='course_id', how='outer')

# Print the first rows of the combined DataFrame as a quick sanity check.
print("Combined Ratings:")
print(combined_ratings.head())

# Save the combined DataFrame to a CSV file (index column omitted).
combined_ratings.to_csv('combined_ratings_by_course.csv', index=False)



Combined Ratings:
                              course_id  average_vader_sentiment_rating  \
0                  aboriginal-education                        4.312560   
1                   access-control-sscp                        3.854867   
2                  accounting-analytics                        4.034119   
3      accounting-data-analytics-python                        4.001650   
4  actualizacion-manejo-diabetes-tipo-2                        2.933641   

   average_vader_sentiment_rating_linear  \
0                               4.140701   
1                               3.568584   
2                               3.792649   
3                               3.752062   
4                               2.417052   

   average_vader_sentiment_rating_piecewise  \
0                                  4.426866   
1                                  4.046308   
2                                  4.169087   
3                                  4.097278   
4                              