In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Load only the text and label columns of the training set.
df = pd.read_csv('/content/drive/Shared drives/Sarcasm Detection/Malayalam/sarcasm_mal_train (1).csv',usecols=['Text','labels'])
# Encode the string labels as integers: Non-sarcastic -> 0, Sarcastic -> 1.
df['labels']=df['labels'].replace({'Non-sarcastic':0,'Sarcastic':1})
# dropna() returns a new frame rather than modifying in place, so the result
# must be assigned back; otherwise rows with missing values are kept.
df = df.dropna()
df

Unnamed: 0,Text,labels
0,Screenshot edukkan vannth njan മാത്രമാണോ,1
1,നമ്മുടെ അനു സിത്താര ചേച്ചി ഇങ്ങനെ വരുന്നത് നോക...,1
2,Mollyhood is getting bigger and bigger,0
3,Ho aaa BGM. Mammookka ithu oru pwoli pwolikkum,0
4,"Enthaale, sambhavam puraanam aanelum backgroun...",1
...,...,...
13183,Madhu C Narayanan .... പ്രതീക്ഷ ഉള്ള ഒരു സംവിധ...,0
13184,🤣🤣🤣🤣🤣 kya mazak hai... Hans hans k lotpot ho j...,0
13185,channel aaanu ishtapettal subscribe cheyyumo plz,0
13186,Nte ponno... kidilam... marana waiting,0


# Pre Processing

Stopwords Removal

In [None]:
# Raw comment text and integer labels taken directly from the training frame.
X=df['Text']
y=df['labels']

In [None]:
import nltk
# Download the NLTK stop-word corpus read by stopwords.words('english') below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# English stop words as a set; preprocess() filters tokens against it.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance; preprocess() applies it to each kept token.
stemmer = PorterStemmer()

def preprocess(text):
    """Normalise one comment into a space-joined string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character found in
    ``string.punctuation``, delete runs of digits, tokenize with NLTK's
    ``word_tokenize``, discard tokens present in ``stop_words`` and stem
    the remaining tokens with ``stemmer``.
    """
    lowered = text.lower()
    # A translation table with a deletion set removes the punctuation chars.
    no_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    no_digits = re.sub(r'\d+', '', no_punct)
    stems = [
        stemmer.stem(token)
        for token in word_tokenize(no_digits)
        if token not in stop_words
    ]
    return ' '.join(stems)

def remove_numeric_values(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the raw text and labels with a fixed seed.
# NOTE(review): X_train/X_test/y_train/y_test are rebound by the split in the
# later TF-IDF cell before any model is fitted, so this split appears unused.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=32)

In [None]:
# Download the Punkt tokenizer data — presumably what word_tokenize() in
# preprocess() relies on; confirm the resource name for the installed NLTK.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

TF-IDF Vectorizer


In [None]:
# Step 1: Install necessary libraries
!pip install scikit-learn

# Step 2: Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

df['Preprocessed_Text'] = df['Text'].apply(preprocess)
df['Preprocessed_Text'] = df['Text'].apply(remove_numeric_values)

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=5, token_pattern=r'\b\w+\b')

# Apply TF-IDF vectorization on preprocessed text
X = vectorizer.fit_transform(df['Preprocessed_Text'])
y = df['labels']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



Upsampling

In [None]:
# Class balance of the training labels before any resampling.
print(y_train.value_counts())

labels
0    8547
1    2003
Name: count, dtype: int64


In [None]:
!pip install imbalanced-learn



In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the training split with SMOTE using a fixed seed; only the
# training features/labels are passed in, so the test split is not resampled.
sm = SMOTE(random_state = 42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
# Class counts after resampling.
print(y_train_res.value_counts())

labels
0    8547
1    8547
Name: count, dtype: int64


Hyperparameter tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [None]:
# Candidate smoothing values for Multinomial Naive Bayes.
parameters = {'alpha': [0.1, 0.5, 1.0]}
# 5-fold grid search over alpha, fitted on the resampled training data.
# NOTE(review): the folds are cut from already-oversampled data, so validation
# folds may contain synthetic points derived from training-fold rows —
# consider resampling inside each fold instead; confirm before trusting CV scores.
grid_search = GridSearchCV(MultinomialNB(), parameters, cv=5)
grid_search.fit(X_train_res, y_train_res)
# Keep the winning parameter dict for the refit in the next cell.
best_params = grid_search.best_params_
print(best_params)

{'alpha': 0.1}


In [None]:
# Rebuild Naive Bayes from the grid-search winner (best_params holds only
# 'alpha' here) and train it on the resampled training data.
clf = MultinomialNB(**best_params)
clf.fit(X_train_res, y_train_res)

In [None]:
# Score the Naive Bayes model on the held-out (non-resampled) test split.
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.90      0.69      0.78      2142
           1       0.34      0.67      0.45       496

    accuracy                           0.69      2638
   macro avg       0.62      0.68      0.62      2638
weighted avg       0.79      0.69      0.72      2638



# Models

SVM

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.metrics import classification_report

In [None]:
# Search space for the SVM: every combination of C, gamma and kernel is tried.
# NOTE(review): gamma is presumably ignored by the 'linear' kernel, in which
# case each linear candidate is refitted once per gamma value for no gain —
# confirm against the scikit-learn SVC docs and consider a list of two grids.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

In [None]:
# Initialize GridSearchCV with the SVM model
# (5-fold CV, verbose progress logging). This rebinds `grid_search`,
# replacing the Naive Bayes search object.
grid_search = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=2, cv=5)

# Perform grid search on the training data
grid_search.fit(X_train_res, y_train_res)

# Get the best parameters from grid search
# (this rebinds `best_params`, replacing the Naive Bayes result)
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  30.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  27.9s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  34.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  42.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  34.8s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  28.9s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  38.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  32.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  36.5s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  26.2s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=  33.2s
[CV] END .......................C=0.1, gamma=0.

In [None]:
# Rebuild the SVM from the grid-search winner; best_params carries exactly
# the searched keys (C, gamma, kernel), so it can be unpacked directly.
best_svm = svm.SVC(**best_params)
best_svm.fit(X_train_res, y_train_res)

In [None]:
# Score the tuned SVM on the held-out test split.
predictions = best_svm.predict(X_test)
print(classification_report(y_test, predictions))

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [None]:
# Search space for logistic regression: regularisation strength, penalty type
# and solver. This rebinds `param_grid`, replacing the SVM grid.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],  # L1 is Lasso, L2 is Ridge
    'solver': ['liblinear', 'saga']  # Solvers that support L1 penalty
}

In [None]:
# 5-fold grid search over the logistic-regression grid with verbose logging.
grid_search = GridSearchCV(LogisticRegression(), param_grid, refit=True, verbose=2, cv=5)

In [None]:
# Run the logistic-regression search on the resampled training data.
grid_search.fit(X_train_res, y_train_res)

In [None]:
# Keep the winning logistic-regression parameters (rebinds `best_params`).
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

In [None]:
# Rebuild logistic regression from the grid-search winner; best_params holds
# exactly the searched keys (C, penalty, solver), so it is unpacked directly.
best_lr = LogisticRegression(**best_params)
best_lr.fit(X_train_res, y_train_res)

In [None]:
# Score the tuned logistic regression on the held-out test split.
predictions = best_lr.predict(X_test)
print("Evaluation on TF-IDF features:")
print(classification_report(y_test, predictions))

KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [None]:
# Search space for k-nearest neighbours: neighbour count, vote weighting and
# distance metric.
# NOTE(review): 'minkowski' with the default power parameter is presumably the
# same distance as 'euclidean', which would make those candidates duplicates —
# confirm against the KNeighborsClassifier docs.
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

In [None]:
# 5-fold grid search over the KNN grid with verbose logging.
grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, refit=True, verbose=2, cv=5)

In [None]:
# Run the KNN search on the resampled training data.
grid_search_knn.fit(X_train_res, y_train_res)

In [None]:
# Keep the winning KNN parameters under their own name.
best_params_knn = grid_search_knn.best_params_
print("Best parameters found: ", best_params_knn)

In [None]:
# Rebuild KNN from the winning parameters and fit it on the resampled data.
best_knn = KNeighborsClassifier(**best_params_knn)
best_knn.fit(X_train_res, y_train_res)

In [None]:
# Score the tuned KNN on the held-out test split.
predictions = best_knn.predict(X_test)
print("Evaluation on TF-IDF features:")
print(classification_report(y_test, predictions))

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 100 trees and a fixed seed; no hyperparameter search here.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train on the resampled training data.
rf_model.fit(X_train_res,y_train_res)

In [None]:
# Random-forest predictions for the held-out test split.
y_pred = rf_model.predict(X_test)

In [None]:
# Per-class precision/recall/F1 for the random forest.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88      2142
           1       0.45      0.40      0.43       496

    accuracy                           0.80      2638
   macro avg       0.66      0.65      0.65      2638
weighted avg       0.79      0.80      0.79      2638



Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with a fixed seed and otherwise default settings.
dt_model = DecisionTreeClassifier(random_state=42)

# Train on the resampled training data.
dt_model.fit(X_train_res,y_train_res)

In [None]:
# Decision-tree predictions for the held-out test split.
y_dectree = dt_model.predict(X_test)

In [None]:
# Per-class precision/recall/F1 for the decision tree.
print(classification_report(y_test, y_dectree))

              precision    recall  f1-score   support

           0       0.85      0.82      0.83      2142
           1       0.33      0.38      0.35       496

    accuracy                           0.74      2638
   macro avg       0.59      0.60      0.59      2638
weighted avg       0.75      0.74      0.74      2638



Ensemble - All 5

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Fresh, unfitted base estimators for the voting ensemble, each built with
# fixed hyperparameters rather than the grid-search winners found earlier.
knn_clf = KNeighborsClassifier(n_neighbors=5)
# `SVC` is never imported as a bare name in this notebook (only the `svm`
# module is), so reference the class through the module to avoid a NameError.
# probability=True is set because the ensemble below uses voting='soft'.
svm_clf = svm.SVC(kernel='linear', C=1.0, probability=True)
dt_clf = DecisionTreeClassifier(random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
log_reg_clf = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
# Combine the five base estimators in a voting classifier configured with
# voting='soft'.
ensemble_clf = VotingClassifier(estimators=[
    ('knn', knn_clf),
    ('svm', svm_clf),
    ('dt', dt_clf),
    ('rf', rf_clf),
    ('log_reg', log_reg_clf)
], voting='soft')

In [None]:
# Fit the ensemble on the resampled training data.
ensemble_clf.fit(X_train_res,y_train_res)

In [None]:
# Ensemble predictions for the held-out test split.
ensemble_pred = ensemble_clf.predict(X_test)

In [None]:
# Per-class precision/recall/F1 for the ensemble.
print(classification_report(y_test, ensemble_pred))

              precision    recall  f1-score   support

           0       0.90      0.78      0.84      2142
           1       0.40      0.64      0.49       496

    accuracy                           0.75      2638
   macro avg       0.65      0.71      0.66      2638
weighted avg       0.81      0.75      0.77      2638



# Result


In [None]:
# Load the text column of the unlabeled test set that predictions are
# generated for below.
df2 = pd.read_csv('/content/drive/Shared drives/Sarcasm Detection/Malayalam/sarcasm_mal_test_without_labels.csv',usecols=['Text'])
df2

Unnamed: 0,Text
0,Shavakallarayile Kuzhimaadathile Peril Oru Let...
1,ഗീതു മോഹൻദാസ് മലയാള സിനിമക്കു നൽകുന്ന വമ്പൻ ഗി...
2,Ente ponno ah sound🥰🥰 poli poli🤘
3,Villain sharafudheen ennu thonnunnavar likikk...
4,pulimurukan trailer ano kanunath 🤔
...,...
2821,Ente ponno oru adaaru jagapoka aanenu manasila...
2822,എന്റെ ഇക്ക nja നമിച്ചു... ഒരു രക്ഷയില്ല ഹെവി ഐ...
2823,ദേ ഇപ്പൊ കണ്ട് ഇറങ്ങിയതേ ഉള്ളു 96 Karikku (+...
2824,1) Drisyam 2) Memories 3) Seconds 4) Grand ma...


In [None]:
# Clean the unlabeled comments with preprocess() and project them onto the
# already-fitted TF-IDF vocabulary (transform only, no refit). `X` is rebound
# to this sparse matrix and used by every prediction cell below.
X = vectorizer.transform(df2['Text'].apply(preprocess))
print(X.shape)

(2826, 2847)


SVM Prediction

In [None]:
# The tuned SVM fitted earlier is bound to `best_svm`; `svm_model` is never
# defined in this notebook and would raise NameError on a fresh kernel.
svm_test_pred=best_svm.predict(X)

Logistic Regression Prediction

In [None]:
# The tuned logistic regression fitted earlier is bound to `best_lr`; `lr1`
# is never defined in this notebook and would raise NameError on a fresh kernel.
lr_test_pred= best_lr.predict(X)
print(lr_test_pred.shape)

(2826,)


KNN Prediction

In [None]:
# The tuned KNN fitted earlier is bound to `best_knn`; `knn` is never defined
# in this notebook and would raise NameError on a fresh kernel.
knn_test_pred= best_knn.predict(X)

Random Forest Prediction

In [None]:
# Show the feature-matrix shape, then label the unlabeled test set with the
# random forest.
print(X.shape)
rf_test_pred=rf_model.predict(X)

(2826, 2847)


Decision Tree Prediction

In [None]:
# Label the unlabeled test set with the decision tree.
dt_test_pred= dt_model.predict(X)

Ensemble Prediction

In [None]:
# Label the unlabeled test set with the voting ensemble; these are the
# predictions written to the output file below.
ensemble_test_pred=ensemble_clf.predict(X)
ensemble_test_pred

array([0, 0, 0, ..., 1, 0, 0])

In [None]:
# Wrap the ensemble predictions in a one-column frame and map the integer
# codes back to the original string labels (inverse of the training encoding).
ensemble_test_pred_df = pd.DataFrame(ensemble_test_pred, columns = ['labels'])
ensemble_test_pred_df['labels']=ensemble_test_pred_df['labels'].replace({0:'Non-sarcastic',1:'Sarcastic'})
ensemble_test_pred_df

Unnamed: 0,labels
0,Non-sarcastic
1,Non-sarcastic
2,Non-sarcastic
3,Non-sarcastic
4,Non-sarcastic
...,...
2821,Non-sarcastic
2822,Non-sarcastic
2823,Sarcastic
2824,Non-sarcastic


In [None]:
# Write the predicted labels to Drive as CSV, with index=False and header=None
# passed so that only the label values are emitted.
ensemble_test_pred_df.to_csv('/content/drive/Shared drives/Sarcasm Detection/Malayalam/output_malayalam_final.csv', sep=",", index=False, header=None)