In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import string
import time

In [2]:
def loop(max_thresh, min_thesh):
    # Import datasets

    train = pd.read_csv('../input/train.csv').dropna()
    test = pd.read_csv('../input/test.csv')
    #sample = pd.read_csv('../input/sample_submission.csv')

    train['text'] = train['text'].astype(str)
    test['text'] = test['text'].astype(str)
    train['text'] = train['text'].apply(lambda x: x.lower())
    test['text'] = test['text'].apply(lambda x: x.lower())

    # Make training/test split
    from sklearn.model_selection import train_test_split

    X_train, X_val = train_test_split(
        train, train_size = 0.80, random_state = 0)

    pos_train = X_train[X_train['sentiment'] == 'positive']
    neutral_train = X_train[X_train['sentiment'] == 'neutral']
    neg_train = X_train[X_train['sentiment'] == 'negative']
    
    
    
    cv = CountVectorizer(max_df=max_thresh, min_df=min_thresh,
                                     max_features=10000,
                                     stop_words='english')

    X_train_cv = cv.fit_transform(X_train['text'])

    X_pos = cv.transform(pos_train['text'])
    X_neutral = cv.transform(neutral_train['text'])
    X_neg = cv.transform(neg_train['text'])

    pos_count_df = pd.DataFrame(X_pos.toarray(), columns=cv.get_feature_names())
    neutral_count_df = pd.DataFrame(X_neutral.toarray(), columns=cv.get_feature_names())
    neg_count_df = pd.DataFrame(X_neg.toarray(), columns=cv.get_feature_names())
    
    
    
    # Use CountVectorizer to get the word counts within each dataset

    cv = CountVectorizer(max_df=max_thresh, min_df=min_thresh,
                                         max_features=10000,
                                         stop_words='english')

    X_train_cv = cv.fit_transform(X_train['text'])

    X_pos = cv.transform(pos_train['text'])
    X_neutral = cv.transform(neutral_train['text'])
    X_neg = cv.transform(neg_train['text'])

    pos_count_df = pd.DataFrame(X_pos.toarray(), columns=cv.get_feature_names())
    neutral_count_df = pd.DataFrame(X_neutral.toarray(), columns=cv.get_feature_names())
    neg_count_df = pd.DataFrame(X_neg.toarray(), columns=cv.get_feature_names())

    # Create dictionaries of the words within each sentiment group, where the values are the proportions of tweets that 
    # contain those words

    pos_words = {}
    neutral_words = {}
    neg_words = {}

    for k in cv.get_feature_names():
        pos = pos_count_df[k].sum()
        neutral = neutral_count_df[k].sum()
        neg = neg_count_df[k].sum()

        pos_words[k] = pos/pos_train.shape[0]
        neutral_words[k] = neutral/neutral_train.shape[0]
        neg_words[k] = neg/neg_train.shape[0]

    # We need to account for the fact that there will be a lot of words used in tweets of every sentiment.  
    # Therefore, we reassign the values in the dictionary by subtracting the proportion of tweets in the other 
    # sentiments that use that word.
    
    neg_words_adj = {}
    pos_words_adj = {}
    neutral_words_adj = {}

    for key, value in neg_words.items():
        if neg_words[key] - (neutral_words[key] + pos_words[key]) > 0:
            neg_words_adj[key] = neg_words[key] - (neutral_words[key] + pos_words[key]) 
        else:
            neg_words_adj[key] = 0

    for key, value in pos_words.items():
        if pos_words[key] - (neutral_words[key] + neg_words[key]) > 0:
            pos_words_adj[key] = pos_words[key] - (neutral_words[key] + neg_words[key])
        else:
            neg_words_adj[key] = 0

    for key, value in neutral_words.items():
        neutral_words_adj[key] = neutral_words[key] - (neg_words[key] + pos_words[key])
        
    def calculate_selected_text(df_row, tol = 0):
    
        tweet = df_row['text']
        sentiment = df_row['sentiment']

        if(sentiment == 'neutral'):
            return tweet

        elif(sentiment == 'positive'):
            dict_to_use = pos_words_adj # Calculate word weights using the pos_words dictionary
        elif(sentiment == 'negative'):
            dict_to_use = neg_words_adj # Calculate word weights using the neg_words dictionary

        words = tweet.split()
        words_len = len(words)
        subsets = [words[i:j+1] for i in range(words_len) for j in range(i,words_len)]

        score = 0
        selection_str = '' # This will be our choice
        lst = sorted(subsets, key = len) # Sort candidates by length


        for i in range(len(subsets)):

            new_sum = 0 # Sum for the current substring

            # Calculate the sum of weights for each word in the substring
            for p in range(len(lst[i])):
                if(lst[i][p].translate(str.maketrans('','',string.punctuation)) in dict_to_use.keys()):
                    new_sum += dict_to_use[lst[i][p].translate(str.maketrans('','',string.punctuation))]

            # If the sum is greater than the score, update our current selection
            if(new_sum > score + tol):
                score = new_sum
                selection_str = lst[i]
                #tol = tol*5 # Increase the tolerance a bit each time we choose a selection

        # If we didn't find good substrings, return the whole text
        if(len(selection_str) == 0):
            selection_str = words

        return ' '.join(selection_str)


    tol = 0.001

    X_val['predicted_selection'] = ''

    for index, row in X_val.iterrows():

        selected_text = calculate_selected_text(row, tol)

        X_val.loc[X_val['textID'] == row['textID'], ['predicted_selection']] = selected_text
        
    X_val['jaccard'] = X_val.apply(lambda x: jaccard(x['selected_text'], x['predicted_selection']), axis = 1)
    time.sleep(90)
    return np.mean(X_val['jaccard'])

In [3]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [8]:
min_threshs

array([0.05 , 0.055, 0.06 , 0.065, 0.07 , 0.075, 0.08 , 0.085, 0.09 ,
       0.095, 0.1  , 0.105, 0.11 , 0.115, 0.12 , 0.125, 0.13 , 0.135,
       0.14 , 0.145, 0.15 , 0.155, 0.16 , 0.165, 0.17 , 0.175, 0.18 ,
       0.185, 0.19 , 0.195])

In [9]:
max_threshs = 0.6 + np.arange(10) * 0.05
min_threshs = np.arange(1,4)

max_l = []
min_l = []
score_l = []
from tqdm import tqdm

for max_thresh in tqdm(max_threshs):
    for min_thresh in min_threshs:
        try:
            score = loop(max_thresh, min_thresh)
            max_l.append(max_thresh)
            min_l.append(min_thresh)
            score_l.append(score)
        except:
            continue
        
pd.DataFrame({'max':max_l, 'min':min_l, 'score':score_l})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

100%|██████████| 10/10 [1:05:25<00:00, 392.60s/it][A


Unnamed: 0,max,min,score
0,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",1,0.629009
1,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",2,0.629009
2,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",3,0.628883
3,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",1,0.629009
4,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",2,0.629009
5,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",3,0.628883
6,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",1,0.629009
7,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",2,0.629009
8,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",3,0.628883
9,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",1,0.629009


In [10]:
pd.DataFrame({'max':max_l, 'min':min_l, 'score':score_l})

Unnamed: 0,max,min,score
0,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",1,0.629009
1,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",2,0.629009
2,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",3,0.628883
3,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",1,0.629009
4,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",2,0.629009
5,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",3,0.628883
6,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",1,0.629009
7,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",2,0.629009
8,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",3,0.628883
9,"[0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1...",1,0.629009


In [12]:
import tokenizers
import os 
import pandas as pd
import torch
import transformers
ROBERT_TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file="../input/vocab.json",
    merges_file="../input/merges.txt",
    lowercase=True
)
TOKENIZER = ROBERT_TOKENIZER
#roberta-base
MODEL_PATH = 'roberta-base'

In [16]:
TOKENIZER.encode('fucking stupid charlie').offsets

[(0, 1), (1, 7), (7, 14), (14, 19), (19, 22)]