In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from make_dataset import load_data
from build_features import get_features

In [11]:
def RB_classifier(filepath, svc_params, rf_params):
    """Train and evaluate the classifiers for the first task.

    Loads the raw data, builds the task-1 features, holds out 20% of the rows
    as a test set (fixed ``random_state=42``) and, for each classifier, fits a
    pipeline that vectorizes the ``cleaned_text`` column with both a
    ``CountVectorizer`` and a ``TfidfVectorizer`` before a one-vs-rest wrapper
    around the model.

    Args:
        filepath (str): filepath of the raw data file
        svc_params (dict): keyword arguments forwarded to ``SVC``
        rf_params (dict): keyword arguments forwarded to
            ``RandomForestClassifier``

    Returns:
        dict: dictionary of models' performance, keyed by the model's class
            name; each value is a dict with the keys 'Accuracy',
            'Confusion Matrix', 'Precision', 'Recall' and 'F1 score'
    """
    data = load_data(filepath)
    # NOTE(review): X is presumably a DataFrame exposing a 'cleaned_text'
    # column, since the ColumnTransformer below selects it by name -- confirm
    # against build_features.get_features.
    X, y = get_features(data, 1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    models = [SVC(**svc_params),
              RandomForestClassifier(**rf_params)]

    res = {}
    for model in models:
        model_name = type(model).__name__

        # Build a fresh preprocessor for every model. Pipeline fits its steps
        # in place, so a single shared ColumnTransformer would be refit (and
        # its fitted state overwritten) by each subsequent pipeline.
        preprocessor = ColumnTransformer(
            transformers=[
                ("bog", CountVectorizer(), 'cleaned_text'),
                ("tfidf", TfidfVectorizer(), 'cleaned_text')]
        )

        pl = Pipeline([
                    ('preprocessor', preprocessor),
                    ('clf', OneVsRestClassifier(model, n_jobs=1)),
                ])

        pl.fit(X_train, y_train)
        preds = pl.predict(X_test)
        accuracy = accuracy_score(y_test, preds)
        matrix = confusion_matrix(y_test, preds)
        # average='binary' requires a two-class target; the returned tuple is
        # (precision, recall, f-score, support).
        stats = precision_recall_fscore_support(y_test, preds, average='binary')

        model_res = {'Accuracy': accuracy,
                     'Confusion Matrix': matrix,
                     'Precision': stats[0],
                     'Recall': stats[1],
                     'F1 score': stats[2]}
        res[model_name] = model_res

    return res

In [12]:
import json

In [13]:
# Load the task-A model parameters. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
# NOTE(review): this is an absolute, machine-specific path; consider a
# project-relative path so the notebook runs on other machines.
with open('/Users/yunyihuang/Desktop/DSC180A-Q1-Project/config/param-A.json', encoding='utf-8') as fh:
    paramA = json.load(fh)

In [14]:
# Display the loaded parameter dict; its 'svc' and 'rf' entries are passed to
# RB_classifier in a later cell.
paramA

{'svc': {'C': 1, 'kernel': 'rbf', 'random_state': 1192},
 'rf': {'n_estimators': 100,
  'criterion': 'gini',
  'max_depth': 30,
  'random_state': 1192}}

In [15]:
# Run the task-1 classifiers on the CSV below with the parameters loaded into
# `paramA`; the returned metrics dict is shown as the cell output.
filepath = '/Users/yunyihuang/Desktop/DSC180A-Q1-Project/notebooks/SentimentLabeled_10112022.csv'
RB_classifier(filepath, svc_params=paramA['svc'], rf_params=paramA['rf'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = df['text'].apply(process_text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bkt.drop_duplicates(inplace=True, ignore_index=True)


{'SVC': {'Accuracy': 0.6991666666666667,
  'Confusion Matrix': array([[421, 166],
         [195, 418]]),
  'Precision': 0.7157534246575342,
  'Recall': 0.6818923327895595,
  'F1 score': 0.6984126984126984},
 'RandomForestClassifier': {'Accuracy': 0.6975,
  'Confusion Matrix': array([[412, 175],
         [188, 425]]),
  'Precision': 0.7083333333333334,
  'Recall': 0.6933115823817292,
  'F1 score': 0.7007419620774938}}

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

from make_dataset import load_data
from build_features import get_features

def SS_regressor(filepath, gb_params, en_params):
    """Train and evaluate the regressors for the second task.

    Loads the raw data, builds the task-2 features, holds out 20% of the rows
    as a test set (fixed ``random_state=42``) and, for each regressor, fits a
    pipeline that vectorizes the ``cleaned_text`` column with both a
    ``CountVectorizer`` and a ``TfidfVectorizer`` before the model.
    ``LGBMRegressor`` is run with its default parameters.

    Args:
        filepath (str): filepath of the raw data file
        gb_params (dict): tuned parameters for Gradient Boosting
        en_params (dict): tuned parameters for Elastic Net

    Returns:
        dict: dictionary of models' performance, keyed by the model's class
            name; each value is a dict with the single key
            'Mean Squared Error'
    """
    data = load_data(filepath)
    # NOTE(review): X is presumably a DataFrame exposing a 'cleaned_text'
    # column, since the ColumnTransformer below selects it by name -- confirm
    # against build_features.get_features.
    X, y = get_features(data, 2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    models = [GradientBoostingRegressor(**gb_params),
              LGBMRegressor(),
              ElasticNet(**en_params)]

    res = {}
    for model in models:
        model_name = type(model).__name__

        # Build a fresh preprocessor for every model. Pipeline fits its steps
        # in place, so a single shared ColumnTransformer would be refit (and
        # its fitted state overwritten) by each subsequent pipeline.
        preprocessor = ColumnTransformer(
            transformers=[
                ("bog", CountVectorizer(), 'cleaned_text'),
                ("tfidf", TfidfVectorizer(), 'cleaned_text')]
        )

        pl = Pipeline([
                    ('preprocessor', preprocessor),
                    ('reg', model),
                ])

        pl.fit(X_train, y_train)
        preds = pl.predict(X_test)
        mse = mean_squared_error(y_test, preds)

        model_res = {'Mean Squared Error': mse}
        res[model_name] = model_res

    return res

In [17]:
# Load the task-B model parameters. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
# NOTE(review): this is an absolute, machine-specific path; consider a
# project-relative path so the notebook runs on other machines.
with open('/Users/yunyihuang/Desktop/DSC180A-Q1-Project/config/param-B.json', encoding='utf-8') as fh:
    paramB = json.load(fh)

In [19]:
# Run the task-2 regressors. `filepath` is not defined in this cell: it is
# reused from the earlier cell that ran RB_classifier, so that cell must be
# executed first.
example_res = SS_regressor(filepath, paramB['gb'], paramB['en'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = df['text'].apply(process_text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ss.drop_duplicates(inplace=True, ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ss.dropna(inplace=True)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [20]:
# Persist the metrics dict returned by SS_regressor as JSON in the current
# working directory.
with open('result.json', 'w') as out_file:
    json.dump(example_res, out_file)