In [46]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import re
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

## Data

In [47]:
# kaggle functions
def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()

In [49]:
train_df = pd.DataFrame(pd.read_csv('../resources/combined_data/train_with_text_labels.csv'))
test_df = pd.read_csv('../resources/combined_data/test_with_text.csv')
train_df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,text,all_labels
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,what is this study about this study used data ...,education longitudinal study|national educatio...
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,november 2004 dropping out of high school is n...,education longitudinal study|national educatio...
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,differences in outcomes for female and male st...,education longitudinal study|national educatio...
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,abstract federal reserve bank of richmond s1 a...,education longitudinal study|national educatio...
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,abstract this article investigates an importan...,education longitudinal study|national educatio...


## Baseline model

In [50]:
# add all dataset labels in training set to list 
dataset_labels = [x.lower() for x in train_df['dataset_label'].unique()]
dataset_titles = [x.lower() for x in train_df['dataset_title'].unique()]
cleaned_labels = [x.lower() for x in train_df['cleaned_label'].unique()]

label_list = set(dataset_labels + dataset_titles + cleaned_labels)

# check if in test text and add to submission dataframe 
labels = []
for index in test_df['Id']:
    text = test_df[test_df['Id'] == index].text.str.cat(sep='\n').lower()
    label = []
    for dataset_title in label_list:
        if dataset_title in text:
            label.append(clean_text(dataset_title))
    labels.append('|'.join(label))

test_df['PredictionString'] = labels

submission_df = test_df.drop(columns=['text']).set_index('Id')

submission_df

Unnamed: 0_level_0,PredictionString
Id,Unnamed: 1_level_1
2100032a-7c33-4bff-97ef-690822c43466,alzheimer s disease neuroimaging initiative ad...
2f392438-e215-4169-bebf-21ac4ff253e1,trends in international mathematics and scienc...
3f316b38-1a24-45a9-8d8c-4e05a42257c6,noaa storm surge inundation|slosh model
8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes


##### Kaggle score: 6.698
Baseline

In [19]:
# submission_df.to_csv('../output/submission1.csv')

## ML approach

#### Feature engineering

In [51]:
X = train_df['text']
y = train_df['all_labels']
print(X.shape, y.shape)

(19661,) (19661,)


In [70]:
# split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.92, random_state=42)

In [71]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
# # Initialize CountVectorizer
# count_vec = CountVectorizer(stop_words='english', max_df=0.7) 
# # Fit and transform training data 
# count_vec_train = count_vec.fit_transform(X_train2) 
# # Transform test set 
# count_vec_test = count_vec.transform(X_test2)
# # Initialize tfidf_vectorizer
# tfidf_transformer = TfidfTransformer() 
# # Fit and transform training data 
# tfidf_train = tfidf_transformer.fit_transform(count_vec_train) 
# # Transform test set 
# tfidf_test = tfidf_transformer.transform(count_vec_test)
# # tfidf_df = pd.DataFrame(tfidf_train.A, columns=TfidfTransformer.get_feature_names())
# # tfidf_df

In [77]:
# more functions
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, confusion_matrix

def model_test(model):
    '''Quickly creates pipeline and displays scores for testing'''
    
    clf_pipe = Pipeline([
#         ('vect', CountVectorizer(stop_words='english', max_df=0.7)),
#         ('tfidf', TfidfTransformer()),
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.7)),
        ('clf', model),
    ])

    clf_pipe.fit(X_train, y_train)
    
    print(model)

    score = clf_pipe.score(X_test, y_test)
    print("Score:", score)
    
def create_submit(model):
    '''Fits training data and makes sumbission predictions'''
    clf_pipe = Pipeline([
        ('vect', CountVectorizer(stop_words='english', max_df=0.7)),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ])
    # Fit and transform data 
    clf_pipe.fit(X_train, y_train)
    # Make predictions
    pred = clf_pipe.predict(test_df.text)
    # add dataset labels preds to submission DF
    test_df['PredictionString'] = pred
    submission_df = test_df.drop(columns=['text']).set_index('Id')
    return submission_df

#### Passive Aggressive

In [78]:
from sklearn.linear_model import PassiveAggressiveClassifier 
clf = PassiveAggressiveClassifier(validation_fraction=.99)
model_test(clf)

# Score: 0.7782750203417412
# Score: 0.7925775292323335
# Score: 0.7925775292323335
# Not warm: Score: 0.7956278596847992
# high val_frac and lower test size: 0.7965670692943421
# .08 test, .99 val_frac: 0.7981118373275236

# .776 without Tfidf
# .778 without count
# .778 with both
# 0.7940241576605213

PassiveAggressiveClassifier(validation_fraction=0.99)
Score: 0.7921169739351558


#### SGD

In [63]:
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier()
model_test(sgd_model)

SGDClassifier()
Score: 0.7144019528071603


#### SVM

In [55]:
## Takes hours!
# from sklearn.svm import SVC

# model_test(SVC())

# Score: 0.7054515866558178

#### One Vs Rest testing

In [56]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.multiclass import OneVsRestClassifier

# log_reg_clf = OneVsRestClassifier(LogisticRegression())

# model_test(log_reg_clf)

In [57]:
# from sklearn.multiclass import OneVsRestClassifier
# from sklearn.svm import SVC

# clf = OneVsRestClassifier(SVC())
# model_test(clf)

# Score: 0.7597640358014646

#### Random Forest

In [58]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
model_test(rfc)

RandomForestClassifier()
Score: 0.75


#### Multi Output Classifier testing

In [59]:
# from sklearn.multioutput import MultiOutputClassifier

# rfc = RandomForestClassifier(n_estimators=100, random_state=1)
# pac = PassiveAggressiveClassifier()
# clf = MultiOutputClassifier(pac)

# clf.fit(tfidf_train, y_train2)
# print(clf)

# score = clf.score(tfidf_test, y_test2)
# print("Score:", score)

# Score: 0.7701383238405207

#### KNN

In [61]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=20)
model_test(knn_model)

# KNeighborsClassifier(n_neighbors=20)
# Score: 0.5502441008950366

KNeighborsClassifier(n_neighbors=20)
Score: 0.5006102522375916


#### Testing KNN params

### ML submission 1

In [22]:
clf = PassiveAggressiveClassifier(validation_fraction=.99)
submission_df = create_submit(clf)
submission_df

Unnamed: 0_level_0,PredictionString
Id,Unnamed: 1_level_1
2100032a-7c33-4bff-97ef-690822c43466,adni|alzheimer s disease neuroimaging initiati...
2f392438-e215-4169-bebf-21ac4ff253e1,nces common core of data|trends in internation...
3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh model|noaa storm surge inundation
8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes


In [24]:
# submission_df.to_csv('../output/submission_pac.csv')

##### Max ML Kaggle score: .639
Using the PassiveAggressiveClassifier