In [1]:
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, StratifiedKFold, GroupKFold, LeavePGroupsOut
import numpy as np
import pandas as pd

Why do we split data for training and testing separately?

* See [examples](https://arxiv.org/pdf/2109.06827.pdf)
* I.I.D. (VERY IMPORTANT!!!)

## Data Splitting
Train/test split by  `train_test_split`:
* By default, `train_test_split` splits the data into 75% training data and 25% test data which is a good rule of thumb.
* `stratify=y` makes sure that the labels to be distributed in train and test sets as they are in the original dataset.

In [10]:
data = [("This book turned out to be a dull and uninteresting depiction of what could have been a fascinating dive into history.", 
      'Negative', 
      'Amazon'),
     ("The Apple Watch is an exceptional piece of technology that seamlessly blends style with functionality.",  
      'Positive', 
      'Amazon'),
     ("I was excited to see this film because I love historical dramas, but it was a huge disappointment. The storyline was disjointed, and it seemed like the director was trying too hard to be artsy. The acting was mediocre at best, and I found myself checking my watch multiple times throughout. Overall, a very underwhelming experience.", 
      'Negative' , 
      'Netflix'),
     ("This movie is a true gem. The storyline was gripping from start to finish, filled with unexpected twists and turns. The performances were top-notch, with the lead actors delivering some of their career-best performances. The cinematography was beautiful, and the soundtrack perfectly complemented the mood of the film. It's a must-watch for anyone who appreciates quality cinema.", 
      'Positive',
      'Netflix'),
      ("...",
       'Negative' , 
      'YouTube'),
     ("...",
      'Positive',
      'YouTube'),
      ("...",
       'Negative' , 
      'Yelp'),
     ("...",
      'Positive',
      'Yelp'),  ]

df = pd.DataFrame(data, columns=['review', 'sentiment', 'source'])
y = df['sentiment']
X = df.drop(columns=['sentiment'])
X_train, X_test = train_test_split(data, test_size=0.5, random_state=42)
X_train


[('I was excited to see this film because I love historical dramas, but it was a huge disappointment. The storyline was disjointed, and it seemed like the director was trying too hard to be artsy. The acting was mediocre at best, and I found myself checking my watch multiple times throughout. Overall, a very underwhelming experience.',
  'Negative',
  'Netflix'),
 ('...', 'Negative', 'YouTube'),
 ("This movie is a true gem. The storyline was gripping from start to finish, filled with unexpected twists and turns. The performances were top-notch, with the lead actors delivering some of their career-best performances. The cinematography was beautiful, and the soundtrack perfectly complemented the mood of the film. It's a must-watch for anyone who appreciates quality cinema.",
  'Positive',
  'Netflix'),
 ('...', 'Negative', 'Yelp')]

## Cross Validation
* KFold, GroupKFold
* [Visualizing cross-validation behavior in scikit-learn](https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html)
<!-- * [Caveats of Cross-validation](https://www.bing.com/search?q=7+cross+validation+mistaks&cvid=de14ffd0df5042cc883c14bea38f3da1&aqs=edge..69i57j0j69i11004.12424j0j4&FORM=ANAB01&PC=NSJS) -->


In [14]:
# cv = KFold(n_splits=2, random_state=42, shuffle=True)
# cv = LeaveOneOut()
# cv = GroupKFold(n_splits=2)
cv = LeavePGroupsOut(n_groups=2)
# cv = StratifiedKFold(n_splits=2)

print(next(cv.split(X, groups=[0, 0, 1, 1, 2, 2,3,3])))

# Do not use list for big data
# The combinatorial complexity for all the possible splitting combinations. For example, the group size is 2145. If you choose 5 from 2145, the number of all the possible combinations would be 
# math.comb(2145, 5) # => 376642337375304 
# print(list(cv.split(X, groups=[0, 0, 1, 1, 2, 2,3,3])))



(array([4, 5, 6, 7]), array([0, 1, 2, 3]))


In [28]:
# read data/data_5.csv
import pandas as pd
df = pd.read_csv('data/data_5.csv')
print(df['property_id'].nunique())
print(len(df))

2145
3000


<!-- ## Hyperparameter Tuning
Methods for Data Split
* Subsampling
* Stratified subsampling
* [Grid Search](https://scikit-learn.org/stable/modules/grid_search.html) -->