In [1]:
import pandas as pd
# Load the preprocessed dataset (two columns: polarity, text).
file_path = "data/noemoticon_preprocessed.csv"
df = pd.read_csv(file_path)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1597267 entries, 0 to 1597266
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1597267 non-null  int64 
 1   text      1597267 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB
None


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from textpreprocesshelper import TextPreprocessHelper, NumberProcessor
# Shared token post-processing helper used by the experiment functions below.
helper = TextPreprocessHelper()

In [3]:
def getBestResult(remove_punctuation=True, number=NumberProcessor.NoAction):
    """Grid-search a TF-IDF + Multinomial Naive Bayes pipeline and print its scores.

    Reads the module-level ``df`` (columns ``text`` and ``polarity``), tokenizes
    the text with ``word_tokenize``, optionally post-processes the tokens, runs a
    3-fold ``GridSearchCV`` on a stratified training split and prints accuracy
    and macro precision / recall / F1 on the held-out split.

    The token lists live in a local Series: the global ``df`` is left untouched,
    so successive calls with different options cannot leak state into each other.

    Args:
        remove_punctuation: when true, every token list is passed through
            ``helper.remove_punctuation``.
        number: ``NumberProcessor.Remove`` applies ``helper.remove_numbers``,
            ``NumberProcessor.ToString`` applies ``helper.replace_numbers``;
            any other value leaves the tokens unchanged.

    Returns:
        None. All results are printed.
    """
    tokens = df["text"].apply(word_tokenize)
    print("tokenize done")

    # Optional token-level clean-up, applied after tokenization.
    if remove_punctuation:
        tokens = tokens.apply(helper.remove_punctuation)

    if number == NumberProcessor.Remove:
        tokens = tokens.apply(helper.remove_numbers)
    elif number == NumberProcessor.ToString:
        tokens = tokens.apply(helper.replace_numbers)

    print("additional preprocess done")

    # Train/test split. Tokens are re-joined with spaces so the vectorizer
    # receives plain strings; the split is stratified on the label.
    train_x, test_x, train_y, test_y = train_test_split(
        tokens.apply(" ".join),
        df['polarity'],
        random_state=34,
        stratify=df['polarity']
    )

    # Pipeline: TF-IDF vectorization followed by the classifier.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')),
        ('clf', MultinomialNB())
    ])

    # n-gram ranges, illustrated on "I love NLP":
    #   (1,1) unigrams only              -> I, love, NLP
    #   (1,2) unigrams + bigrams         -> I, love, NLP, "I love", "love NLP"
    #   (2,2) bigrams only               -> "I love", "love NLP"
    #   (1,3) unigrams through trigrams  -> all of the above plus "I love NLP"
    # Only (1,1) and (1,2) are searched here.
    parameters = {
        'tfidf__ngram_range': [(1,1), (1,2)],
        'tfidf__lowercase': [True, False],
        # Vocabulary sizes; values based on check_low_high_frequence_words
        # in 2-preprocessing-analysis.ipynb.
        'tfidf__max_features': [20000, 50000, 100000],
        # With and without the built-in English stop-word list.
        'tfidf__stop_words': [None, 'english'],
        # Single candidate; listed so the classifier appears in the printed
        # best-parameter summary below.
        'clf': [MultinomialNB()]
    }

    # Run the grid search (3-fold CV, accuracy, all cores).
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
    grid_search.fit(train_x, train_y)

    # Report the best CV score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # Predict on the held-out split with the refit best estimator.
    predictions = grid_search.best_estimator_.predict(test_x)

    # Map the numeric polarity codes to readable class names before scoring.
    label_mapping = {0: "Negative", 4: "Positive"}
    test_y_labels = test_y.map(label_mapping)
    predictions_labels = pd.Series(predictions).map(label_mapping)

    # Held-out evaluation.
    print("Accuracy:", metrics.accuracy_score(test_y_labels, predictions_labels))
    print("Precision:", metrics.precision_score(test_y_labels, predictions_labels, average='macro'))
    print("Recall:", metrics.recall_score(test_y_labels, predictions_labels, average='macro'))
    print("F1 Score:", metrics.f1_score(test_y_labels, predictions_labels, average='macro'))

In [4]:
# Experiment: punctuation removed, numbers handled with NumberProcessor.ToString.
getBestResult(remove_punctuation=True, number=NumberProcessor.ToString)

tokenize done
additional preprocess done
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best score: 0.801
Best parameters set:
	clf: MultinomialNB()
	tfidf__lowercase: True
	tfidf__max_features: 100000
	tfidf__ngram_range: (1, 2)
	tfidf__stop_words: None
Accuracy: 0.8023475083705427
Precision: 0.8023979475993759
Recall: 0.8023484922810957
F1 Score: 0.8023396176160206


In [5]:
# Experiment: punctuation removed, numbers handled with NumberProcessor.Remove.
getBestResult(remove_punctuation=True, number=NumberProcessor.Remove)

tokenize done
additional preprocess done
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best score: 0.802
Best parameters set:
	clf: MultinomialNB()
	tfidf__lowercase: True
	tfidf__max_features: 100000
	tfidf__ngram_range: (1, 2)
	tfidf__stop_words: None
Accuracy: 0.8025478504546513
Precision: 0.8026002992250363
Recall: 0.8025488534976455
F1 Score: 0.8025396521534354


In [6]:
# Experiment: punctuation removed, numbers left untouched (NumberProcessor.NoAction).
getBestResult(remove_punctuation=True, number=NumberProcessor.NoAction)

tokenize done
additional preprocess done
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best score: 0.801
Best parameters set:
	clf: MultinomialNB()
	tfidf__lowercase: True
	tfidf__max_features: 100000
	tfidf__ngram_range: (1, 2)
	tfidf__stop_words: None
Accuracy: 0.802440166584443
Precision: 0.8024945704322033
Recall: 0.8024411883701349
F1 Score: 0.8024316483978


## No significant differences among NoAction, Remove, and ToString.

## Decided to REMOVE numbers to improve generalization and reduce dimensionality.

In [7]:
# Experiment: punctuation kept, numbers handled with NumberProcessor.Remove.
getBestResult(remove_punctuation=False, number=NumberProcessor.Remove)

tokenize done
additional preprocess done
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best score: 0.801
Best parameters set:
	clf: MultinomialNB()
	tfidf__lowercase: True
	tfidf__max_features: 100000
	tfidf__ngram_range: (1, 2)
	tfidf__stop_words: None
Accuracy: 0.8020194482078148
Precision: 0.8020944350267605
Recall: 0.8020206490195787
F1 Score: 0.8020075924342476


### Based on observation, punctuation should be removed.

In [8]:
import pandas as pd
# NOTE(review): this rebinds the global `df` to a different CSV than the one
# loaded in the first cell; every function that reads `df` sees this data from
# here on. Confirm that switching datasets for the logistic-regression runs is
# intended.
file_path = "data/noemoticon_with_header.csv"
df = pd.read_csv(file_path)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()

def getLogisticRegressionBestResult(maxIter=100, vocab_size=100000):
    """Grid-search a TF-IDF + Logistic Regression pipeline and print its scores.

    Reads the module-level ``df`` (columns ``text`` and ``polarity``), tokenizes
    the text with ``word_tokenize``, passes the tokens through
    ``helper.remove_punctuation`` and ``helper.remove_numbers``, runs a 3-fold
    ``GridSearchCV`` over the regularization strength ``C`` on a stratified
    training split and prints accuracy and macro precision / recall / F1 on the
    held-out split.

    The token lists live in a local Series: the global ``df`` is left untouched.

    Args:
        maxIter: ``max_iter`` passed to ``LogisticRegression``.
        vocab_size: ``max_features`` of the TF-IDF vectorizer.

    Returns:
        None. All results are printed.
    """
    tokens = (
        df["text"]
        .apply(word_tokenize)
        .apply(helper.remove_punctuation)
        .apply(helper.remove_numbers)
    )

    # Train/test split. Tokens are re-joined with spaces so the vectorizer
    # receives plain strings; the split is stratified on the label.
    train_x, test_x, train_y, test_y = train_test_split(
        tokens.apply(" ".join),
        df['polarity'],
        random_state=34,
        stratify=df['polarity']
    )

    # Pipeline: TF-IDF vectorization followed by logistic regression.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')),
        ('clf', LogisticRegression(max_iter= maxIter))
    ])

    # Grid: the vectorizer settings are pinned to a single value each
    # (unigrams + bigrams, lowercasing on, no stop-word list); only C varies.
    parameters = {
        'tfidf__ngram_range': [(1,2)],
        'tfidf__lowercase': [True],
        'tfidf__max_features': [vocab_size],
        'tfidf__stop_words': [None],
        'clf__C': [0.1, 1.0, 10.0],
        # Single candidate; listed so the classifier appears in the printed
        # best-parameter summary below.
        'clf': [LogisticRegression(max_iter= maxIter)]

    }

    # Run the grid search (3-fold CV, accuracy, all cores).
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
    grid_search.fit(train_x, train_y)

    # Report the best CV score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # Predict on the held-out split with the refit best estimator.
    predictions = grid_search.best_estimator_.predict(test_x)

    # Map the numeric polarity codes to readable class names before scoring.
    label_mapping = {0: "Negative", 4: "Positive"}
    test_y_labels = test_y.map(label_mapping)
    predictions_labels = pd.Series(predictions).map(label_mapping)

    # Held-out evaluation.
    print("Accuracy:", metrics.accuracy_score(test_y_labels, predictions_labels))
    print("Precision:", metrics.precision_score(test_y_labels, predictions_labels, average='macro'))
    print("Recall:", metrics.recall_score(test_y_labels, predictions_labels, average='macro'))
    print("F1 Score:", metrics.f1_score(test_y_labels, predictions_labels, average='macro'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1599999 non-null  int64 
 1   text      1599999 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB
None


In [9]:
# Logistic regression run with the default vocabulary size spelled out.
getLogisticRegressionBestResult(maxIter=100, vocab_size=100_000)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best score: 0.820
Best parameters set:
	clf: LogisticRegression()
	clf__C: 1.0
	tfidf__lowercase: True
	tfidf__max_features: 100000
	tfidf__ngram_range: (1, 2)
	tfidf__stop_words: None
Accuracy: 0.8228325
Precision: 0.8229760878128611
Recall: 0.8228325
F1 Score: 0.822812806652319


In [10]:
# Logistic regression run with the TF-IDF vocabulary capped at 50,000 features.
getLogisticRegressionBestResult(maxIter=100, vocab_size=50_000)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best score: 0.818
Best parameters set:
	clf: LogisticRegression()
	clf__C: 1.0
	tfidf__lowercase: True
	tfidf__max_features: 50000
	tfidf__ngram_range: (1, 2)
	tfidf__stop_words: None
Accuracy: 0.8205375
Precision: 0.820684429275716
Recall: 0.8205374999999999
F1 Score: 0.8205169413811533


In [11]:
# Logistic regression run with the TF-IDF vocabulary capped at 30,000 features.
getLogisticRegressionBestResult(maxIter=100, vocab_size=30_000)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best score: 0.816
Best parameters set:
	clf: LogisticRegression()
	clf__C: 1.0
	tfidf__lowercase: True
	tfidf__max_features: 30000
	tfidf__ngram_range: (1, 2)
	tfidf__stop_words: None
Accuracy: 0.8169775
Precision: 0.8171091448886023
Recall: 0.8169775
F1 Score: 0.8169585030096347
