# **WEB MINING PROJECT**

**Task 1: Using TF-IDF Vectors and Logistic Regression for Sentiment Classification**

**1. preprocessing dataset**

In [1]:
#importing necessary modules and packages
from collections import Counter
import pandas as pd
import numpy as np

In [2]:
#we transform the dataset into a dictionary
dict_rev={
"R1":"The plot is predictable without excitement. I won't recommend it.",
"R2":"The plot is novel and the story is interesting.",
"R3":"The plot is fast-paced and the story is filled with excitement. I would recommend it.",
"R4":"The story is unconvincing and the plot is predictable."}
# positive reviews are represented by 1 and negative ones by 0
dict_labels={"R1":0, "R2":1, "R3":1, "R4":0}

In [3]:
class dict_preprocessing :
    cust_stopwords =["i","you", "he", "she", "it", "we", "they" , "is", 
                     "will", "are", "would", "wouldn't", "the", "and", "or", "with"]
    def __init__(self, dictionary):
        self.__dict_rev = dictionary

    def lowercasing(self): # lowercasing user reviews
        for ind,val in self.__dict_rev.items():
            self.__dict_rev[ind] = val.lower()
    
    def removing_punctuations(self):
        for ind,val in self.__dict_rev.items():  # removing punctuations "." and ","
            self.__dict_rev[ind] = val.replace("."," ").replace(","," ").strip()
    
    def removing_stopwords(self):
        for ind,val in self.__dict_rev.items(): # removing stopwords
            x= val.split()
            for i in dict_preprocessing.cust_stopwords:
                while i in x:
                    x.remove(i)
            self.__dict_rev[ind] = " ".join(x)
        
    def apply_preprocessing_pipeline(self): 
        self.lowercasing()
        self.removing_punctuations()
        self.removing_stopwords()
    @property
    def dict_rev(self):
        return self.__dict_rev

In [4]:
dict_obj = dict_preprocessing(dict_rev)
dict_obj.apply_preprocessing_pipeline()

dict_rev = dict_obj.dict_rev
print(dict_rev)

{'R1': "plot predictable without excitement won't recommend", 'R2': 'plot novel story interesting', 'R3': 'plot fast-paced story filled excitement recommend', 'R4': 'story unconvincing plot predictable'}


In [5]:
terms=["plot", "predictable", "without", "excitement", "won't", "recommend",
        "novel", "story", "interesting", "fast-paced", "filled", "unconvincing"]
reviews=["R1", "R2", "R3", "R4"]

In [6]:
#calculate tf_idf
class tf_idf:
    def __init__(self, terms ,reviews, dictionary, transformed = None):
        self.terms =terms
        self.reviews = reviews
        self.__tf_idf_table = pd.DataFrame(0.0, index = terms, columns = reviews)
        self.dict_rev = dictionary
        self.tansformed = transformed
        
    def calculate(self, test=False):
        if not test:
            N= len(self.dict_rev) #number of reviews N
            freq= np.array([len(val.split()) for val in self.dict_rev.values()],dtype=np.float32) # the total number of words in each review
            for lab, data in self.__tf_idf_table.iterrows() :
                df= sum([1 for val in self.dict_rev.values() if lab in val.split()])
                tf= np.array([val.count(lab) for val in self.dict_rev.values()], dtype =np.float32)
                tf/= freq
                log10_tf = np.where(tf >0 , np.log10(tf), 0)
                self.__tf_idf_table.loc[[lab], :] = ((1+log10_tf)*(np.log10(N/df)))
        else :
            N= len(self.dict_rev) #number of reviews N
            freq= np.array([len(val.split()) for val in self.dict_rev.values()],dtype=np.float32) # the total number of words in each review
            for lab, data in self.__tf_idf_table.iterrows() :
                log_df= transformed[lab]
                tf= np.array([val.count(lab) for val in self.dict_rev.values()], dtype =np.float32)
                tf/= freq
                log10_tf = np.where(tf >0 , np.log10(tf), 0)
                self.__tf_idf_table.loc[[lab], :] = (1+log10_tf)*(log_df)
    @property
    def tf_idf_table(self):
        return self.__tf_idf_table
    
    def transform_idf(self): #for using it on the test dataset
        N= len(self.dict_rev) #number of reviews N
        save_idf =dict()
        for lab, data in self.__tf_idf_table.iterrows() :
            df= sum([1 for val in self.dict_rev.values() if lab in val.split()])
            save_idf[lab]=np.log10(N/df)
        return save_idf


In [7]:
tr_obj = tf_idf(terms,reviews, dict_rev)
tr_obj.calculate()
tf_idf_table = tr_obj.tf_idf_table
print(tf_idf_table)

                    R1        R2        R3        R4
plot          0.000000  0.000000  0.000000  0.000000
predictable   0.066783  0.301030  0.301030  0.119792
without       0.133566  0.602060  0.602060  0.602060
excitement    0.066783  0.301030  0.066783  0.301030
won't         0.133566  0.602060  0.602060  0.602060
recommend     0.066783  0.301030  0.066783  0.301030
novel         0.602060  0.239584  0.602060  0.602060
story         0.124939  0.049718  0.027718  0.049718
interesting   0.602060  0.239584  0.602060  0.602060
fast-paced    0.602060  0.602060  0.133566  0.602060
filled        0.602060  0.602060  0.133566  0.602060
unconvincing  0.602060  0.602060  0.602060  0.239584


**2. making the binary classification for the user reviews**

In [8]:
#Logistc regression 
class log_reg:
    def __init__(self, data, w, rev_lab, b=0):
        self.x = data
        self.w = w
        self.b = b
        self.reviews_labels = rev_lab

    def z_calc(self):
        return np.dot(self.w.T, self.x)+ self.b

    def logistic_reg(self):
        z= self.z_calc()
        return (1/(1+np.exp(-z)))

    def compute_pred(self):
        a = self.logistic_reg().squeeze()
        return (a>0.5).astype(np.int32).reshape(1,-1)

    def table(self):
        x = {"Review ID":self.reviews_labels,"Sigmoid Score (Pred Score)": self.logistic_reg().squeeze(),
             "Pred Label": self.compute_pred().squeeze()}
        tab = pd.DataFrame(x)
        return tab

    def bin_cross_error(self,labels_dict):
        h=self.logistic_reg()
        labels = np.array(list(labels_dict.values()), dtype = np.float32).reshape(1,-1)
        return np.mean(-(labels*np.log(h)+(1-labels)*np.log(1-h)))

In [9]:
w=np.array([0.5,-0.2,-0.3,1.5,-0.3,1.1,0.9,0.8,1.3,1.7,0.1,-0.4]).reshape(-1,1)
x= tf_idf_table.to_numpy()
log_obj = log_reg(x,w,["R1", "R2", "R3", "R4"])
pred = log_obj.table()
print(pred)

  Review ID  Sigmoid Score (Pred Score)  Pred Label
0        R1                    0.912736           1
1        R2                    0.854579           1
2        R3                    0.749978           1
3        R4                    0.939894           1


**3. compute the average cross-entropy loss**

In [10]:
log_obj.bin_cross_error(dict_labels)

1.4238313069116355

In [11]:
dict_new_rev={"R5": "The story is filled with excitement,I will recommend it."}
dict_new_label={"R5": 1}

In [12]:
dict_obj = dict_preprocessing(dict_new_rev)
dict_obj.apply_preprocessing_pipeline()
dict_new_rev = dict_obj.dict_rev
print(dict_new_rev)

{'R5': 'story filled excitement recommend'}


In [13]:
transformed = tr_obj.transform_idf()
t_obj = tf_idf(terms, ["R5"], dict_new_rev, transformed)
t_obj.calculate(True)
test_tf_idf_table = t_obj.tf_idf_table
print(test_tf_idf_table)

                    R5
plot          0.000000
predictable   0.301030
without       0.602060
excitement    0.119792
won't         0.602060
recommend     0.119792
novel         0.602060
story         0.049718
interesting   0.602060
fast-paced    0.602060
filled        0.239584
unconvincing  0.602060


In [14]:
w=np.array([0.5,-0.2,-0.3,1.5,-0.3,1.1,0.9,0.8,1.3,1.7,0.1,-0.4]).reshape(-1,1)
x= test_tf_idf_table.to_numpy()
test_log_obj = log_reg(x,w,["R5"])
pred = test_log_obj.table()
print(pred)

  Review ID  Sigmoid Score (Pred Score)  Pred Label
0        R5                     0.88705           1


**Task 2: Using Dense Vector Representation and Logistic Regression for Sentiment Classification**

In [15]:
d1 = [0.58, -0.48, 0.65, -1.35, 1.85] 
d2 = [-0.61, 1.97, -1.06, 0.34, -0.37] 
d3 = [-1.46, 0.18, 0.07, 1.07, 1.74] 
d4 = [-1.64, -1.22, 1.98, -1.06, -1.04] 
w = np.array([-0.3, 1.8, 0.9, 1.5, -0.1]).reshape(-1,1)
x = np.array([d1, d2, d3, d4]).T
dense_log_obj = log_reg(x,w,["R1", "R2", "R3", "R4"])
pred = dense_log_obj.table()
print(pred)

  Review ID  Sigmoid Score (Pred Score)  Pred Label
0        R1                    0.065192           0
1        R2                    0.965176           1
2        R3                    0.905167           1
3        R4                    0.196550           0


In [16]:
dense_log_obj.bin_cross_error(dict_labels)

0.10533382664807818

In [17]:
d1 = [0.52, 0.94, 0.75, -1.88,1.61] 
w = np.array([-0.3, 1.8, 0.9, 1.5, -0.1]).reshape(-1,1)
x = np.array([d1]).T
dense_log_obj = log_reg(x,w,["R1"])
pred = dense_log_obj.table()
print(pred)

  Review ID  Sigmoid Score (Pred Score)  Pred Label
0        R1                    0.316479           0


**The differences between using tf-idf representation and dense vector representation**


the intrinsic difference between using tf-idf and dense vectors is that tf-idf assigns weights to words based on their frequency in the documents and rarety in the corpus.
By considering both local and global word importance, tf-idf is useful for tasks like information retrieval and document clustring. However, it's not well-suited for tasks like sentiment analysis since it does not capture semantic relationships between words, unlike dense vectors that consider the context and meaning of words. 
Those conclusions are drawn from the project's results on both tf_idf and dense vectors, since the dense vectors' average binary cross-entropy is significantly smaller than tf_idf's one