In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

## Data

In [3]:
# kaggle functions
def clean_text(txt):
    """Normalise a value into lowercase, space-separated alphanumeric tokens.

    The input is coerced to ``str`` and lowercased, every run of characters
    outside ``A-Z``, ``a-z`` and ``0-9`` is replaced by a single space, and
    leading/trailing whitespace is removed.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [4]:
# Both CSVs live in the same directory; keep that location in one place.
COMBINED_DATA_DIR = '../resources/combined_data'

# pd.read_csv already returns a DataFrame, so wrapping the result in
# pd.DataFrame(...) again is redundant.
train_df = pd.read_csv(f'{COMBINED_DATA_DIR}/train_with_text.csv')
test_df = pd.read_csv(f'{COMBINED_DATA_DIR}/test_with_text.csv')
train_df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,text,all_labels
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title what is this study about text th...,national education longitudinal study|educatio...
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title november 2004 text dropping out ...,national education longitudinal study|educatio...
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title differences in outcomes for fema...,national education longitudinal study|educatio...
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title abstract text federal reserve ba...,national education longitudinal study|educatio...
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title abstract text this article inves...,national education longitudinal study|educatio...


In [5]:
# both_labels = train_df['all_labels'].str.split('|', n=1, expand = True)
# train_df['label_1'] = both_labels[0]
# train_df['label_2'] = both_labels[1]
# train_df['label_2'] = train_df['label_2'].fillna(' ',inplace=True)
# train_df['label_2'] = train_df['label_2'].replace('None', '', inplace=True)
# train_df 

In [6]:
# train_df['label_2'] = np.where(train_df['label_2']==train_df['label_1'], '', train_df['label_2'])
# train_df

## Submission 1 (cheating)

In [7]:
# Every distinct cleaned label seen in the training set is a candidate title.
datasets_titles = train_df.cleaned_label.unique()

# For each test row, pool the text of all rows sharing its Id, lowercase it,
# and keep every known title that occurs in it as a plain substring.
labels = []
for doc_id in test_df['Id']:
    doc_text = test_df.loc[test_df['Id'] == doc_id, 'text'].str.cat(sep='\n').lower()
    matched = [clean_text(title) for title in datasets_titles if title in doc_text]
    labels.append('|'.join(matched))

# Attach the pipe-joined matches and build the Id-indexed submission frame.
test_df['PredictionString'] = labels
submission_df = test_df.drop(columns=['text']).set_index('Id')

submission_df

Unnamed: 0_level_0,PredictionString
Id,Unnamed: 1_level_1
2100032a-7c33-4bff-97ef-690822c43466,adni|alzheimer s disease neuroimaging initiati...
2f392438-e215-4169-bebf-21ac4ff253e1,nces common core of data|trends in internation...
3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh model|noaa storm surge inundation
8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes


In [8]:
# submission_df.to_csv('../output/submission1.csv')

## ML approach

#### Feature engineering

In [9]:
# Features are the document text; the target is the whole pipe-joined label
# string, so each distinct combination of labels is treated as its own class.
X, y = train_df['text'], train_df['all_labels']
# y2 = train_df[['label_1','label_2']]

# Sanity check: both series must have the same number of rows.
print(X.shape, y.shape)

(19661,) (19661,)


In [10]:
# Split the data into a training part and a held-out evaluation part.
# No test_size or stratify is passed, so scikit-learn's defaults apply;
# random_state=42 makes the shuffle (and therefore the split) repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
# Bag-of-words step: learn the vocabulary from the training texts only,
# with English stop words removed and max_df=0.7 filtering very common terms.
count_vec = CountVectorizer(max_df=0.7, stop_words='english')
count_vec_train = count_vec.fit_transform(X_train)

# TF-IDF step: fit the re-weighting on the training counts.
tfidf_transformer = TfidfTransformer()
tfidf_train = tfidf_transformer.fit_transform(count_vec_train)

# Push the held-out texts through the two already-fitted steps
# (transform only, so nothing is learned from the evaluation data).
count_vec_test = count_vec.transform(X_test)
tfidf_test = tfidf_transformer.transform(count_vec_test)
# tfidf_df = pd.DataFrame(tfidf_train.A, columns=TfidfTransformer.get_feature_names())
# tfidf_df

In [12]:
# more functions
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, confusion_matrix

def model_test(model):
    '''Fit a CountVectorizer -> TF-IDF -> ``model`` pipeline and print hold-out scores.

    The pipeline is fitted on the notebook-level ``X_train``/``y_train`` and
    evaluated on ``X_test``/``y_test``. Prints the estimator, its accuracy
    ("Score") and the micro-averaged precision and recall.

    Parameters
    ----------
    model : scikit-learn classifier
        Unfitted estimator used as the final pipeline step. It is fitted in
        place as part of the pipeline.

    Returns
    -------
    None
        Results are printed only.
    '''
    # Local import keeps this cell self-contained; scikit-learn is already a
    # dependency of the notebook.
    from sklearn.metrics import accuracy_score

    clf_pipe = Pipeline([
        ('vect', CountVectorizer(stop_words='english', max_df=0.7)),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ])

    clf_pipe.fit(X_train, y_train)

    print(model)

    # Predict once and reuse the result for every metric. Calling
    # clf_pipe.score() and then clf_pipe.predict() would vectorise and classify
    # the test set twice, which is costly for slow predictors such as KNN.
    pred = clf_pipe.predict(X_test)

    # A classifier's .score() is plain accuracy, so this prints the same value.
    score = accuracy_score(y_test, pred)
    print("Score:", score)

    # pos_label is only honoured with average='binary'; with 'micro' it is
    # ignored (and merely triggers a warning), so it is not passed.
    # For single-label multiclass targets, micro-averaged precision and recall
    # are both equal to accuracy, so all three printed numbers coincide.
    print("Precision score: ", precision_score(y_test, pred, average='micro'))
    print("Recall score:", recall_score(y_test, pred, average='micro'))
    print('-----------------')
    
def create_submit(model):
    '''Fit the text pipeline around ``model`` and build the submission frame.

    The pipeline (CountVectorizer -> TF-IDF -> ``model``) is fitted on the
    notebook-level ``X_train``/``y_train`` and used to predict a label string
    for every row of ``test_df.text``.

    Parameters
    ----------
    model : scikit-learn classifier
        Unfitted estimator used as the final pipeline step. It is fitted in
        place as part of the pipeline.

    Returns
    -------
    pandas.DataFrame
        ``test_df`` without its ``text`` column, indexed by ``Id``, with the
        predictions in ``PredictionString``.
    '''
    clf_pipe = Pipeline([
        ('vect', CountVectorizer(stop_words='english', max_df=0.7)),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ])
    # Fit the whole pipeline on the training split
    clf_pipe.fit(X_train, y_train)
    # Make predictions for the competition test documents
    pred = clf_pipe.predict(test_df.text)
    # Build the submission from a derived frame instead of writing the
    # predictions into the shared test_df: the function then has no hidden
    # side effect, and calling it for several models cannot leave another
    # model's predictions behind in test_df.
    submission_df = (
        test_df
        .drop(columns=['text'])
        .assign(PredictionString=pred)
        .set_index('Id')
    )
    return submission_df

#### SGD

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

# Linear model with hinge loss and L2 penalty, trained by SGD.
# max_iter=5 together with tol=None means a fixed, small number of passes
# with no tolerance-based stopping; random_state pins the shuffling.
sgd_model = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    max_iter=5,
    tol=None,
    random_state=42,
)

model_test(sgd_model)

SGDClassifier(alpha=0.001, max_iter=5, random_state=42, tol=None)
Score: 0.654393816110659
Precision score:  0.654393816110659
Recall score: 0.654393816110659
-----------------


#### SGD grid search

#### OneVsRest grid search

In [87]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.multiclass import OneVsRestClassifier

# params = [{'estimator__C': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]}]

# log_reg_clf = OneVsRestClassifier(LogisticRegression())

# logistic_gs = GridSearchCV(log_reg_clf, params, scoring='f1_micro', cv=3)

# logistic_gs.fit(tfidf_train, y_train)
# print(logistic_gs.best_estimator_)

#### Random Forest testing

In [14]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.multioutput import MultiOutputClassifier

# rfc = RandomForestClassifier(n_estimators=100, random_state=1)
# clf = MultiOutputClassifier(rfc, n_jobs=-1)
# clf.fit(tfidf_train, y_train)

# score = clf.score(tfidf_test, y_test)
# print("Score:", score)

# pred = clf.predict(tfidf_test)

# print("Precision score: ",precision_score(y_test, pred, pos_label='positive', average='micro'))
# print("Recall score:", recall_score(y_test, pred, pos_label='positive', average='micro'))
# print('-----------------')

#### Passive Aggressive

In [15]:
from sklearn.linear_model import PassiveAggressiveClassifier 
# PassiveAggressiveClassifier shuffles the training data between epochs, so an
# unseeded instance gives a slightly different score on every run. Pin the
# seed (same value as the split and the SGD model) to make the result repeatable.
model_test(PassiveAggressiveClassifier(random_state=42))

PassiveAggressiveClassifier()
Score: 0.7786818551668023
Precision score:  0.7786818551668023
Recall score: 0.7786818551668023
-----------------


#### KNN

In [16]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with k=20, evaluated through model_test (which wraps
# the estimator in the CountVectorizer -> TF-IDF pipeline before fitting).
knn_model = KNeighborsClassifier(n_neighbors=20)
model_test(knn_model)

KNeighborsClassifier(n_neighbors=20)
Score: 0.5502441008950366
Precision score:  0.5502441008950366
Recall score: 0.5502441008950366
-----------------


#### Naive Bayes
##### Multinomial

In [17]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with default settings, evaluated through model_test
# on the same pipeline features as the other models.
multinom_nb = MultinomialNB()
model_test(multinom_nb)

MultinomialNB()
Score: 0.31061838893409277
Precision score:  0.31061838893409277
Recall score: 0.31061838893409277
-----------------


#### Naive Bayes
##### Bernoulli

In [18]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli naive Bayes with default settings, evaluated through model_test
# for comparison with the multinomial variant.
bernoulli_nb = BernoulliNB()
model_test(bernoulli_nb)

BernoulliNB()
Score: 0.35862489829129374
Precision score:  0.35862489829129374
Recall score: 0.35862489829129374
-----------------


#### Testing KNN params

In [19]:
# # Iterate through different k values to see which has the highest accuracy
# train_scores = []
# test_scores = []
# for k in range(1, 20, 2):
#     clf = KNeighborsClassifier(n_neighbors=k)
#     clf.fit(tfidf_train, y_train)
#     train_score = clf.score(tfidf_train, y_train)
#     test_score = clf.score(tfidf_test, y_test)
#     train_scores.append(train_score)
#     test_scores.append(test_score)
    
# plt.plot(range(1, 20, 2), train_scores, marker='o')
# plt.plot(range(1, 20, 2), test_scores, marker="x")
# plt.xlabel("k neighbors")
# plt.ylabel("Testing accuracy Score")
# plt.show()

##### K=15

In [21]:
# Create a KNN classifier with k=15 and evaluate it via model_test, which
# fits the full text pipeline on the training split and prints the scores.
knn_model = KNeighborsClassifier(n_neighbors=15)
model_test(knn_model)

KNeighborsClassifier(n_neighbors=15)
Score: 0.5598047192839707
Precision score:  0.5598047192839707
Recall score: 0.5598047192839707
-----------------


### ML submission 1

In [22]:
# Seed the classifier so the submitted predictions can be regenerated exactly;
# an unseeded PassiveAggressiveClassifier shuffles differently on each run.
submission_df = create_submit(PassiveAggressiveClassifier(random_state=42))
submission_df

Unnamed: 0_level_0,PredictionString
Id,Unnamed: 1_level_1
2100032a-7c33-4bff-97ef-690822c43466,adni|alzheimer s disease neuroimaging initiati...
2f392438-e215-4169-bebf-21ac4ff253e1,nces common core of data|trends in internation...
3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh model|noaa storm surge inundation
8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes


In [24]:
# Write the PassiveAggressive submission to disk. submission_df is indexed by
# Id, so the Id values are written out alongside PredictionString.
submission_df.to_csv('../output/submission_pac.csv')