META STACKING

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

In [3]:
file_path = '/content/sarcasm_tam_train (1).csv'

try:
    # Load CSV file and select specific columns
    df = pd.read_csv(file_path, usecols=['Text', 'labels'])

    # Drop rows with NaN values
    df.dropna(inplace=True)

    # Display the DataFrame to verify changes
    print(df.head())

except Exception as e:
    print(f"Error: {e}")


                                                Text         labels
0  அர்யவுக்கு  ஒரு நல்ல வாய்ப்பு சிங்கம் சூரியா அ...  Non-sarcastic
1  பள்ளியோ கல்லூரியோ படித்துக்கொண்டிருக்கும்போது ...  Non-sarcastic
2  தல தல தல தல தல தல தல தல தல தல தல தல தல தல தல த...  Non-sarcastic
3  All the best to the team....super ah na Oru ka...  Non-sarcastic
4  Bahut over mack up kiya hua hai, it is look li...  Non-sarcastic


In [4]:
df['labels']=df['labels'].replace({'Non-sarcastic':0,'Sarcastic':1})

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    text = re.sub(r'\d+', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

TF-IDF VECTOR

In [8]:
# Step 1: Install necessary libraries
!pip install scikit-learn

# Step 2: Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

df['Preprocessed_Text'] = df['Text'].apply(preprocess)

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=5, token_pattern=r'\b\w+\b')

# Apply TF-IDF vectorization on preprocessed text
X = vectorizer.fit_transform(df['Preprocessed_Text'])
y = df['labels']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



UPSAMPLING

In [9]:
print(y_train.value_counts())

labels
0    17422
1     6234
Name: count, dtype: int64


In [10]:
!pip install imbalanced-learn



In [11]:
from imblearn.over_sampling import SMOTE

In [12]:
sm = SMOTE(random_state = 42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print(y_train_res.value_counts())

labels
0    17422
1    17422
Name: count, dtype: int64


SVM

In [13]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [14]:
svm_model = SVC(kernel='linear', random_state=42)

svm_model.fit(X_train_res,y_train_res)

In [15]:
y_svm = svm_model.predict(X_test)

In [16]:
print(classification_report(y_test, y_svm))

              precision    recall  f1-score   support

           0       0.88      0.78      0.83      4318
           1       0.54      0.70      0.61      1596

    accuracy                           0.76      5914
   macro avg       0.71      0.74      0.72      5914
weighted avg       0.79      0.76      0.77      5914



Logisitc Regression

In [17]:
from sklearn.linear_model import LogisticRegression

lr1 = LogisticRegression()
lr1.fit(X_train_res, y_train_res.ravel())
predictions = lr1.predict(X_test)

In [18]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.88      0.77      0.82      4318
           1       0.53      0.72      0.61      1596

    accuracy                           0.75      5914
   macro avg       0.71      0.74      0.72      5914
weighted avg       0.79      0.75      0.76      5914



KNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X_train_res,y_train_res)
predictions_knn= knn.predict(X_test)

In [20]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions_knn))

              precision    recall  f1-score   support

           0       0.92      0.40      0.56      4318
           1       0.36      0.91      0.51      1596

    accuracy                           0.54      5914
   macro avg       0.64      0.65      0.54      5914
weighted avg       0.77      0.54      0.55      5914



Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train_res,y_train_res)

In [23]:
y_pred = rf_model.predict(X_test)

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84      4318
           1       0.56      0.53      0.55      1596

    accuracy                           0.76      5914
   macro avg       0.70      0.69      0.69      5914
weighted avg       0.76      0.76      0.76      5914



Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
dt_model = DecisionTreeClassifier(random_state=42)

dt_model.fit(X_train_res,y_train_res)

In [27]:
y_dectree = dt_model.predict(X_test)

In [28]:
print(classification_report(y_test, y_dectree))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80      4318
           1       0.48      0.50      0.49      1596

    accuracy                           0.72      5914
   macro avg       0.64      0.65      0.65      5914
weighted avg       0.72      0.72      0.72      5914



Stacking - Meta Ensemble

In [29]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [30]:
base_estimators = [
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier()),
    ('rf', RandomForestClassifier()),
    ('svm', SVC(probability=True)),
    ('knn', KNeighborsClassifier())
]

meta_learner = LogisticRegression()
stacking_clf = StackingClassifier(estimators=base_estimators, final_estimator=meta_learner)

In [31]:
stacking_clf.fit(X_train_res,y_train_res)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [32]:
stack_pred = stacking_clf.predict(X_test)
print(classification_report(y_test, stack_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      4318
           1       0.63      0.54      0.58      1596

    accuracy                           0.79      5914
   macro avg       0.74      0.71      0.72      5914
weighted avg       0.78      0.79      0.79      5914



Validate

In [46]:
df2 = pd.read_csv('/content/sarcasm_tam_test_without_labels.csv',usecols=['Text'])
df2

Unnamed: 0,Text
0,Kangana wow awesome yr ye lakdi sbae alh hai
1,விழுப்புரம் வன்னிய கவுண்டர் சார்பாக வாழ்த்துக...
2,திரௌபதி திரைப்படம் வெற்றி பெற வாணியர் சமுதாயம்...
3,"இந்த திரைப்படம் வெற்றிபெற, ஆதி தமிழன் அதாவது இ..."
4,dai thala pera sonnalay summa tamil naday athi...
...,...
6333,NTR _ Ajith mutuals like here
6334,aiyo #thala marana mass #thala love you so muc...
6335,Yan kadavula I love you thala
6336,Thank you vijay sethupathi....for acted at syr...


In [34]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [47]:
X=df2['Text']
X=X.apply(preprocess)
X = vectorizer.transform(X)
print(X.shape)

(6338, 4917)


SVM Prediction

In [48]:
svm_test_pred=svm_model.predict(X)

Logisitic Regression Prediction

In [50]:
lr_test_pred= lr1.predict(X)
print(lr_test_pred.shape)

(6338,)


KNN Prediction

In [51]:
knn_test_pred= knn.predict(X)

Random Forest Prediction

In [52]:
print(X.shape)
rf_test_pred=rf_model.predict(X)

(6338, 4917)


Decision Tree Prediction

In [53]:
dt_test_pred= dt_model.predict(X)

Stacking

In [54]:
stack_test_pred=stacking_clf.predict(X)

In [55]:
reverse_key={0:'Non-sarcastic',1:'Sarcastic'}

In [56]:
predicted_labels = [reverse_key[label] for label in lr_test_pred]

In [61]:
X_main=df2['Text']
results_df = pd.DataFrame({'Text': X_main, 'labels': predicted_labels})
print(results_df)
results_df.to_csv('/content/output_tam_final.csv', sep=",", index=False, header=None)

                                                   Text         labels
ID                                                                    
0         Kangana wow  awesome yr ye lakdi sbae alh hai  Non-sarcastic
1     விழுப்புரம்  வன்னிய கவுண்டர் சார்பாக வாழ்த்துக...  Non-sarcastic
2     திரௌபதி திரைப்படம் வெற்றி பெற வாணியர் சமுதாயம்...  Non-sarcastic
3     இந்த திரைப்படம் வெற்றிபெற, ஆதி தமிழன் அதாவது இ...  Non-sarcastic
4     dai thala pera sonnalay summa tamil naday athi...  Non-sarcastic
...                                                 ...            ...
6333                      NTR _ Ajith mutuals like here      Sarcastic
6334  aiyo #thala marana mass #thala love you so muc...      Sarcastic
6335                      Yan kadavula I love you thala      Sarcastic
6336  Thank you vijay sethupathi....for acted at syr...  Non-sarcastic
6337    Amitab and taapsi manu ki copy picture bnai h y  Non-sarcastic

[6338 rows x 2 columns]
