In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

In [2]:
train_data = pd.read_csv("C:/Users/TIRTH JOSHI/Desktop/Sigmoid ML/dataset/train.csv")

In [3]:
output_data = pd.read_csv("C:/Users/TIRTH JOSHI/Desktop/Sigmoid ML/dataset/test.csv")

In [4]:
emotions = train_data.columns.values[2:] 

In [5]:
def tokenize(text): 
    tknzr = TweetTokenizer()
    return tknzr.tokenize(text)

def stem(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

en_stopwords = set(stopwords.words("english")) 

In [6]:
def report_results(model, X, y):
    pred = model.predict(X)        
    acc = accuracy_score(y, pred)
    f1 = f1_score(y, pred)
    prec = precision_score(y, pred)
    rec = recall_score(y, pred)
    result = {'f1': f1, 'acc': acc, 'precision': prec, 'recall': rec}
    return result

In [7]:
svc__C = []
for i in range(0,20,1):
    svc__C.append(i/100)

In [11]:
for emotion in emotions:
    temp = train_data[["Tweet" , emotion]]
    emotion_data = temp.copy()
    train, test = train_test_split(emotion_data, test_size=0.1, random_state=1)
    X_train = train['Tweet'].values
    X_test = test['Tweet'].values
    y_train = train[emotion]
    y_test = test[emotion]
    vectorizer = CountVectorizer(
        analyzer = 'word',
        tokenizer = tokenize,
        lowercase = True,
        ngram_range=(1, 1),
        stop_words = en_stopwords)
    kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    np.random.seed(1)
    pipeline_svm = make_pipeline(vectorizer, 
                            SVC(probability=True, kernel="linear"))
    grid_svm = GridSearchCV(pipeline_svm,
                    param_grid = {' ': svc__C}, 
                    cv = kfolds,
                    scoring="roc_auc",
                    verbose=1,   
                    n_jobs=-1) 
    print(f"Started Training {emotion}")
    print(grid_svm.get_params().keys())
    grid_svm.fit(X_train, y_train)
    print(f"Completed Training {emotion}")
    print(f"Score of {emotion} = {grid_svm.score(X_test, y_test)}")
    print(f"grid_svm.best_params_ of {emotion} = {grid_svm.best_params_}")
    print(f"grid_svm.best_score_ of {emotion} = {grid_svm.best_score_}")
    print(f"For {emotion}")
    print(report_results(grid_svm.best_estimator_, X_test, y_test))
    emotion_coloum = []
    for tweet in output_data["Tweet"]:
        emotion_coloum.append(grid_svm.predict([tweet])[0])
    output_data[emotion] = emotion_coloum
    

Started Training anger
dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__countvectorizer', 'estimator__svc', 'estimator__countvectorizer__analyzer', 'estimator__countvectorizer__binary', 'estimator__countvectorizer__decode_error', 'estimator__countvectorizer__dtype', 'estimator__countvectorizer__encoding', 'estimator__countvectorizer__input', 'estimator__countvectorizer__lowercase', 'estimator__countvectorizer__max_df', 'estimator__countvectorizer__max_features', 'estimator__countvectorizer__min_df', 'estimator__countvectorizer__ngram_range', 'estimator__countvectorizer__preprocessor', 'estimator__countvectorizer__stop_words', 'estimator__countvectorizer__strip_accents', 'estimator__countvectorizer__token_pattern', 'estimator__countvectorizer__tokenizer', 'estimator__countvectorizer__vocabulary', 'estimator__svc__C', 'estimator__svc__break_ties', 'estimator__svc__cache_size', 'estimator__svc__class_weight', 'estimator__svc__coef0

ValueError: Invalid parameter   for estimator Pipeline(steps=[('countvectorizer',
                 CountVectorizer(stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', 'can',
                                             'couldn', "couldn't", ...},
                                 tokenizer=<function tokenize at 0x000001A16BCAF0D0>)),
                ('svc', SVC(kernel='linear', probability=True))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
output_data.columns.values

In [None]:
from pathlib import Path  
filepath = Path('C:/Users/TIRTH JOSHI/Desktop/Sigmoid ML/dataset/output.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
output_data.to_csv(filepath)

In [None]:
data = pd.read_csv("C:/Users/TIRTH JOSHI/Desktop/Sigmoid ML/dataset/output.csv")

In [None]:
subm = output_data[output_data.columns.values]

In [None]:
subm

In [None]:
from pathlib import Path  
filepath = Path('C:/Users/TIRTH JOSHI/Desktop/Sigmoid ML/dataset/output.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
subm.to_csv(filepath)

In [None]:
# temp = data[["Tweet" , "anger"]]

In [None]:
# anger_data = temp.copy()

In [None]:
# anger_data

In [None]:
# train, test = train_test_split(anger_data, test_size=0.2, random_state=1)
# X_train = train['Tweet'].values
# X_test = test['Tweet'].values
# y_train = train['anger']
# y_test = test['anger']

In [None]:
# def tokenize(text): 
#     tknzr = TweetTokenizer()
#     return tknzr.tokenize(text)

# def stem(doc):
#     return (stemmer.stem(w) for w in analyzer(doc))

# en_stopwords = set(stopwords.words("english")) 

In [None]:
# vectorizer = CountVectorizer(
#     analyzer = 'word',
#     tokenizer = tokenize,
#     lowercase = True,
#     ngram_range=(1, 1),
#     stop_words = en_stopwords)

In [None]:
# kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [None]:
# np.random.seed(1)

In [None]:
# pipeline_svm = make_pipeline(vectorizer, 
#                              SVC(probability=True, kernel="linear", class_weight="balanced"))

In [None]:
# grid_svm = GridSearchCV(pipeline_svm,
#                     param_grid = {'svc__C': [0.01, 0.1, 1]}, 
#                     cv = kfolds,
#                     scoring="roc_auc",
#                     verbose=1,   
#                     n_jobs=-1) 

In [None]:
# grid_svm.fit(X_train, y_train)
# grid_svm.score(X_test, y_test)

In [None]:
# grid_svm.best_params_

In [None]:
# grid_svm.best_score_

In [None]:
# def report_results(model, X, y):
#     pred = model.predict(X)        
#     acc = accuracy_score(y, pred)
#     f1 = f1_score(y, pred)
#     prec = precision_score(y, pred)
#     rec = recall_score(y, pred)
#     result = {'auc': auc, 'f1': f1, 'acc': acc, 'precision': prec, 'recall': rec}
#     return result

In [None]:
# def report_results(model, X, y):
#     pred_proba = model.predict_proba(X)[:, 1]
#     pred = model.predict(X)        

#     auc = roc_auc_score(y, pred_proba)
#     acc = accuracy_score(y, pred)
#     f1 = f1_score(y, pred)
#     prec = precision_score(y, pred)
#     rec = recall_score(y, pred)
#     result = {'auc': auc, 'f1': f1, 'acc': acc, 'precision': prec, 'recall': rec}
#     return result

In [None]:
# report_results(grid_svm.best_estimator_, X_test, y_test)

In [None]:
estimator.get_params().keys()