# Indus Valley Script- Text Analysis for Decipherment 

## Input data processing and data cleanup

This file is used to process the Input data, clean it up and create various pickled dataframes.
The dataset was created as a CSV file from the raw HTML files on the ICIT web site, which hold the ICIT code for each text.
Data labels were changed, and a linearized copy of the original text was added.

### Input:
icit_text_text_corpus.csv and icit_sign_corpus.csv are the input CSV files.

### Output:
Various Pickled dataframes

!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install scikit-learn
!pip install nltk
!pip install ipywidgets
!pip install -U dill
!pip3 install requests
!pip3 install -U spacy

In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import collections
import random
import traceback
import pickle
%matplotlib inline

In [2]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from collections import defaultdict

In [3]:
# ---- Run configuration: the switches read by the cells below ----

# When True, the cleaning cells drop repeated texts.
drop_duplicate_texts = True

# Set the filters on data here
filter_by_site = False
filter_by_keywords = False
filter_by_text_length= False

# Pattern matched against the `site` column when filter_by_site is True.
# It is always defined so that switching the flag on cannot raise a NameError;
# swap in one of the commented alternatives to filter on another site.
site = 'Mohenjo-daro'
#site = 'Harappa'
#site = 'Dholavira'
#site = 'Rakhigarhi'

# Pattern matched against the `keywords` column when filter_by_keywords is True.
keyword = "Bull"
#keyword = "Gaur"

# Bounds compared against `text_length` when filter_by_text_length is True.
# The filter uses strict > and <, so both bounds are exclusive.
min_text_length=1
max_text_length=50

# Number of rows read from the text corpus CSV.
num_rows_text_corpus= 4999

In [4]:
"""Read the signs"""
# Load the sign corpus with every column read as a string.
orig_sign_df=pd.read_csv('../../IndusCorpusUtils/data/icit_corpus/icit_sign_corpus.csv',dtype=str)

"""Set the max columns to none"""
pd.set_option('display.max_columns', None)

print(len(orig_sign_df))

"""Pickle it"""
# This is the first cell that writes into pickle/. Create the directory if it
# is missing so a fresh checkout can run the notebook top to bottom.
os.makedirs('pickle', exist_ok=True)
orig_sign_df.to_pickle('pickle/orig_sign_df.pkl')

709


In [5]:
"""Read the Text Corpus"""
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Load the text corpus: all columns as strings, limited to the first
# num_rows_text_corpus rows.
orig_df = pd.read_csv(
    '../../IndusCorpusUtils/data/icit_corpus/icit_text_text_corpus.csv',
    nrows=num_rows_text_corpus,
    dtype=str,
)


In [6]:
""" Reverse text and add that as a new column"""
""" Add text length as a column """
# Reverse the order of the space-separated sign codes, row by row.
# Mapping over the whole column keeps each result aligned with its own row: the
# earlier loop skipped empty strings, which would have produced a list shorter
# than the frame. Missing values are passed through by na_action='ignore'.
orig_df['reversed_text'] = orig_df['l_to_r_text'].map(
    lambda text: ' '.join(reversed(text.split(' '))), na_action='ignore'
)  # same as r_to_l text

# Text length = number of sign codes in the text.
# The earlier character-count / 3 estimate also counted the separating spaces,
# so a 3-sign text such as "590 390 740" (11 characters) rounded to 4.
# The result is cast to float so the column keeps its previous dtype.
orig_df['text_length'] = orig_df['l_to_r_text'].map(
    lambda text: len(text.split()), na_action='ignore'
).astype(float)

print("Dataframe has ", len(orig_df.index), " rows")

print(orig_df)

"""Pickle it"""
orig_df.to_pickle('pickle/upd_orig_df.pkl')


Dataframe has  4999  rows
     icit_id        site keywords text_class lines direction       text signs  \
0          1  Alamgirpur      NaN         SS     1       L/R  +410-017+     2   
1          2  Alamgirpur      NaN         SS     1       L/R  +410-017+     2   
2          3  Alamgirpur      NaN         SC     1       L/R  +405-017+     2   
3          4   Allahdino      NaN         ??     1       NaN  +220-000+     1   
4          5   Allahdino     Bull         UC     1       R/L  +740-235+     2   
...      ...         ...      ...        ...   ...       ...        ...   ...   
4994    4064     Harappa      NaN         UC     1       NaN      +000[     0   
4995    4065     Harappa      NaN         VN     1       R/L  ]700-032[     2   
4996    4065     Harappa      NaN         UC     1       R/L  ]000-000[     0   
4997    4066     Harappa      NaN         UC     1       R/L  +368-000+     1   
4998    4066     Harappa      NaN         VN     1       R/L  +700-033+     2   

 

In [7]:
# Work on a copy so the optional filters below never modify orig_df.
df = orig_df.copy()

if filter_by_site:
    # Keep only the rows whose site matches the configured pattern;
    # rows with a missing site are dropped.
    df = df[df['site'].str.contains(site, na=False)]
    print("After filtering by site ", site, " it has ", len(df.index), " rows")

if filter_by_keywords:
    # Keep only the rows whose keywords match the configured pattern;
    # rows with missing keywords are dropped.
    df = df[df['keywords'].str.contains(keyword, na=False)]
    print("After filtering by keywords ", keyword, " it has ", len(df.index), " rows")

if filter_by_text_length:
    # Both bounds are exclusive.
    above_min = df['text_length'].gt(min_text_length)
    below_max = df['text_length'].lt(max_text_length)
    df = df[above_min & below_max]
    print("After filtering by text_length ",  " it has ", len(df.index), " rows")

# Snapshot of the filtered (but not yet cleaned) rows, reused by later cells.
df_filtered = df.copy()

In [8]:
""" Retain texts that are only wanted """

""" remove the values where the text is unclear"""
# Drop texts that contain the sign code '000'.
df = df[df['l_to_r_text'].str.contains('000') == False]

print("After removing unclear texts, we have ", len(df.index), " rows")

if(drop_duplicate_texts):
    # A row is a duplicate only when text, keywords and site all match an
    # earlier row; the first occurrence is retained.
    df = df.drop_duplicates(subset =["text", "keywords", "site"], inplace = False, keep = "first")
    print("After removing duplicate texts, we have ", len(df.index), " rows")


# Keep only the texts that are not multi-line (no '/' in the text).
df = df[df['text'].str.contains('/') == False]

print("After removing multi-line text, we have ", len(df.index), " rows")


# Single-sign texts have no '/' in their direction, so we need to include them explicitly.
# Btw standardized_text is Left to right as in English
df = df[(df['direction'].str.contains('/') == True) | (df['text_length'] ==1)]

print("After keeping only text with known direction, we have ", len(df.index), " rows")

# Remove multipart texts that have [ or ].
# regex=False matches the bracket literally; the earlier "\[" / "\]" patterns
# were invalid escape sequences in a non-raw string literal.
df = df[df['text'].str.contains('[', regex=False) == False]
df = df[df['text'].str.contains(']', regex=False) == False]

print("After keeping only text without multipart, we have ", len(df.index), " rows")

"""Pickle it"""
df.to_pickle('pickle/clean_df.pkl')

After removing unclear texts, we have  3945  rows
After removing duplicate texts, we have  2856  rows
After removing multi-line text, we have  2778  rows
After keeping only text with known direction, we have  2646  rows
After keeping only text without multipart, we have  2223  rows


In [9]:
""" of those whose direction is know print out L/R and L/R text count"""
# Count the cleaned texts per recorded direction.
# Rows with a missing direction match neither pattern.
df_l_r = df[df['direction'].str.contains('L/R', na=False)]
print("L/R texts: ", df_l_r.shape[0])

df_r_l = df[df['direction'].str.contains('R/L', na=False)]
print("R/L texts: ", df_r_l.shape[0])

L/R texts:  97
R/L texts:  1985


# Indus core region Texts

In [10]:
"""Keep Indus core region site texts, removing others"""
indus_non_core_sites = ['Altin Yepe', 'Hajar', 'Ra\'s al-Junayz', 'Qala\'at al-Bahrain', 'Failaka', 'Kish', 'Nippur','Luristan', 'Susa', 'Altyn Depe', 'Tepe Yahya', 'Tello', 'Ur', 'Tell Umma', 'Gonur Depe']

# Start from a copy of the cleaned texts and remove one non-core site name at a
# time. A row is kept only when its site is present and does not match the name.
df_indus_core_region = df.copy()

for indus_non_core_site in indus_non_core_sites:
    site_matches = df_indus_core_region['site'].str.contains(indus_non_core_site)
    df_indus_core_region = df_indus_core_region[site_matches.eq(False)]

print("After removing Indus non-core region  site ", indus_non_core_sites , " it has ", len(df_indus_core_region.index), " rows")


After removing Indus non-core region  site  ['Altin Yepe', 'Hajar', "Ra's al-Junayz", "Qala'at al-Bahrain", 'Failaka', 'Kish', 'Nippur', 'Luristan', 'Susa', 'Altyn Depe', 'Tepe Yahya', 'Tello', 'Ur', 'Tell Umma', 'Gonur Depe']  it has  2213  rows


# Indus non core region Texts

In [11]:
"""Store Indus non-core region texts"""
# indus_non_core_sites is the list defined in the core-region cell.

# One slice of the cleaned texts per non-core site name.
non_core_frames = [
    df[df['site'].str.contains(indus_non_core_site) == True]
    for indus_non_core_site in indus_non_core_sites
]

# Concatenate once instead of growing a frame inside the loop.
# The slices are joined in reverse list order because the earlier loop
# prepended each new slice; this keeps the row order of the pickled frame.
# An empty site list yields an empty frame with the same columns.
if non_core_frames:
    df_indus_non_core_region = pd.concat(non_core_frames[::-1])
else:
    df_indus_non_core_region = df.iloc[0:0]

print("Indus non-core region site texts", indus_non_core_sites , " has ", len(df_indus_non_core_region.index), " rows")
print(df_indus_non_core_region)

df_indus_non_core_region.to_pickle('pickle/non_core_df.pkl')


Indus non-core region site texts ['Altin Yepe', 'Hajar', "Ra's al-Junayz", "Qala'at al-Bahrain", 'Failaka', 'Kish', 'Nippur', 'Luristan', 'Susa', 'Altyn Depe', 'Tepe Yahya', 'Tello', 'Ur', 'Tell Umma', 'Gonur Depe']  has  10  rows
     icit_id                site             keywords text_class lines  \
4782    3884           Tell Umma                Bull1         PP     1   
4796    3897                  Ur                 Gaur         SC     1   
4797    3898                  Ur                 Gaur         UC     1   
15        16          Altyn Depe                  NaN         UC     1   
4780    3882                Susa                 Gaur         LC     1   
2928    2153            Luristan                 Gaur         SC     1   
2721    1971                Kish             Bull1:II         LP     1   
4758    3863  Qala'at al-Bahrain                 Gaur         SC     1   
4760    3865      Ra's al-Junayz                  NaN         SS     1   
161      160               Ha

# Unclear Texts

In [12]:
""" Keep the items with unclear text in another dataframe"""
# Rows of the filtered corpus whose text contains the sign code '000'.
df_unclear = df_filtered[df_filtered['l_to_r_text'].str.contains('000') == True]

print("We have", len(df_unclear.index), " rows of unclear texts")

if(drop_duplicate_texts):
    # TBD How can unclear text be duplicates
    # NOTE(review): keep=False discards every copy of a repeated text, so no
    # representative row survives. The cleaning cell uses keep="first" instead.
    # Confirm this difference is intended.
    df_unclear  =df_unclear.drop_duplicates(subset ="text",
                         keep = False, inplace = False)

    print("After removing duplicate texts, we have ", len(df_unclear.index), " rows")

# Keep only the texts that are not multi-line (no '/' in the text).
df_unclear= df_unclear[df_unclear['text'].str.contains('/') == False]

print("After removing multi-line text, we have ", len(df_unclear.index), " rows")


"""Single sign ones don't have direction and won't have /, wo we need to explicitly include it
 Btw standardized_text is Left to right as in English
 df = df[df['direction'].str.contains('/') == True]  """
df_unclear= df_unclear[(df_unclear['direction'].str.contains('/') == True) | (df_unclear['text_length'] ==1)]

print("After keeping only text with known direction, we have ", len(df_unclear.index), " rows")

# Remove multipart texts that have [ or ].
# regex=False matches the bracket literally; the earlier "\[" / "\]" patterns
# were invalid escape sequences in a non-raw string literal.
df_unclear= df_unclear[df_unclear['text'].str.contains('[', regex=False) == False]
df_unclear= df_unclear[df_unclear['text'].str.contains(']', regex=False) == False]

print("After keeping only text without multipart, we have ", len(df_unclear.index), " rows")

#Note: Lot of the text with unclear text have direction empty

"""Pickle it"""
df_unclear.to_pickle('pickle/unclear_df.pkl')

We have 1054  rows of unclear texts
After removing duplicate texts, we have  763  rows
After removing multi-line text, we have  728  rows
After keeping only text with known direction, we have  586  rows
After keeping only text without multipart, we have  291  rows


# Multi Line Texts

In [13]:
""" Keep the text that are multiline (has ''/'') in another dataframe """

print("Dataframe has ", len(df_filtered.index), " rows")

""" remove the values where the text is unclear"""
# Keep the rows whose text is present and does not contain the code '000'.
df_multi_line = df_filtered[df_filtered['l_to_r_text'].str.contains('000').eq(False)]

print("After removing unclear texts, we have ", len(df_multi_line.index), " rows")

if drop_duplicate_texts:
    # keep=False removes every row whose text occurs more than once.
    df_multi_line = df_multi_line.drop_duplicates(subset="text", keep=False)

    print("After removing duplicate texts, we have ", len(df_multi_line.index), " rows")


# Keep only the multi-line texts, i.e. those with a '/' in the text.
df_multi_line = df_multi_line[df_multi_line['text'].str.contains('/', na=False)]

print("We have", len(df_multi_line.index), " rows of multi line texts")
print(df_multi_line['text'])

# Written twice: as a CSV in the working directory and as a pickle.
df_multi_line.to_csv('multi_line_texts.csv')

"""Pickle it"""
df_multi_line.to_pickle('pickle/multi_line_df.pkl')

Dataframe has  4999  rows
After removing unclear texts, we have  3945  rows
After removing duplicate texts, we have  2130  rows
We have 77  rows of multi line texts
69                      +032-031/151-740-240-235+
71              +032-031/850-032-530-740-741-456+
72                          +032-031/740-791-713+
74                              +032/226-032-817+
80                          +740-636-240/002-817+
                          ...                    
4386                    +621/090-740-231-560-534+
4402                +790/740-100-415-740-257-840+
4705                        +740-900-003/741-002+
4729                                    +840/790+
4752    +605-740-142-067/002-374-310-350-495-834+
Name: text, Length: 77, dtype: object


### Feature Extraction

"""We will keep All data, All Indus Core, Train data and Test data"""

## All data

In [14]:
# Site labels of the cleaned texts. The reversed-text variants use the same
# labels (y axis is still the same); each name gets its own .values lookup.
y_all, y, y_all_rev, y_rev = (df['site'].values for _ in range(4))

# Displayed as the cell result.
y_rev.shape

(2223,)

In [15]:

# Forward features: text, direction and ICIT id of every cleaned text.
x_all, x = (
    np.asarray(df[['l_to_r_text', 'direction', 'icit_id']]) for _ in range(2)
)

# Reversed features: only the reversed text.
x_all_rev, x_rev = (df['reversed_text'].values for _ in range(2))

# Displayed as the cell result.
x_rev.shape


(2223,)

## Indus Core region data

In [16]:
# Site labels of the core-region texts; the reversed variant uses the same
# labels (y axis is still the same).
y_core, y_core_rev = (df_indus_core_region['site'].values for _ in range(2))

# Displayed as the cell result.
y_core_rev.shape



(2213,)

In [17]:
# Forward features of the core-region texts: text, direction and ICIT id.
core_feature_columns = ['l_to_r_text', 'direction', 'icit_id']
x_core = np.asarray(df_indus_core_region[core_feature_columns])

# Reversed features: only the reversed text.
x_core_rev = df_indus_core_region['reversed_text'].values

# Displayed as the cell result.
x_core_rev.shape


(2213,)

### Train-test split

In [18]:
def rev(df_in):
    """Return a new frame holding the sign sequences of ``df_in`` in reverse order.

    Each value of ``df_in['l_to_r_text']`` is split on single spaces, the
    pieces are reversed and re-joined with a space.

    Parameters:
        df_in: DataFrame with an ``l_to_r_text`` column of strings.

    Returns:
        A DataFrame with one column, ``reversed_text``, one row per input row,
        on a fresh default index (the index of ``df_in`` is not carried over).
    """
    flipped = [
        ' '.join(sequence.split(' ')[::-1])
        for sequence in df_in['l_to_r_text']
    ]

    result = pd.DataFrame()
    result['reversed_text'] = flipped
    return result

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
"""Split the data into Train and Test"""
# 80/20 split of the cleaned texts; the fixed random_state makes it repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=43, test_size=0.20
)

In [21]:
"""Split the Indus Core region data into Train and Test"""
# 80/20 split of the core-region texts; the fixed random_state makes it repeatable.
x_core_train, x_core_test, y_core_train, y_core_test = train_test_split(
    x_core, y_core, random_state=43, test_size=0.20
)


In [22]:
"""Build dataframe out of train and test data"""

# Training split: column 0 of x_train holds the left-to-right text. Name it,
# reverse each text with rev(), and keep the result as an (n, 1) array.
df_train_x1 = pd.DataFrame(x_train).rename(columns={0: 'l_to_r_text'})
df_train_x1_rev = rev(df_train_x1)
x_rev_train = np.asarray(df_train_x1_rev[['reversed_text']])
# Labels are shared between the forward and reversed training sets.
y_rev_train = y_train

# Test split: same steps as for the training split.
df_test_x1 = pd.DataFrame(x_test).rename(columns={0: 'l_to_r_text'})
df_test_x1_rev = rev(df_test_x1)
x_rev_test = np.asarray(df_test_x1_rev[['reversed_text']])
# Labels are shared between the forward and reversed test sets.
y_rev_test = y_test

In [23]:
"""Build dataframe out of Indus Core Region train and test data"""

# Core-region training split: column 0 of x_core_train holds the left-to-right
# text. Name it, reverse each text with rev(), and keep an (n, 1) array.
df_core_train_x1 = pd.DataFrame(x_core_train).rename(columns={0: 'l_to_r_text'})
df_core_train_x1_rev = rev(df_core_train_x1)
x_rev_core_train = np.asarray(df_core_train_x1_rev[['reversed_text']])
# Labels are shared between the forward and reversed training sets.
y_rev_core_train = y_core_train

# Core-region test split: same steps as for the training split.
df_core_test_x1 = pd.DataFrame(x_core_test).rename(columns={0: 'l_to_r_text'})
df_core_test_x1_rev = rev(df_core_test_x1)
x_rev_core_test = np.asarray(df_core_test_x1_rev[['reversed_text']])
# Labels are shared between the forward and reversed test sets.
y_rev_core_test = y_core_test

In [24]:
#(x_rev_train,x_rev_test,y_rev_train,y_rev_test)=train_test_split(x_rev,y_rev,test_size=0.10, random_state=43)

In [25]:
"""All data, fwd and reverse"""
# Each array is wrapped in a DataFrame and its positional columns are named in
# a single rename call: forward frames get text / direction / icit_id, reversed
# frames get reversed_text, and label frames get site.

# All data, forward and reversed
df_all_x = pd.DataFrame(x_all).rename(
    columns={0: 'l_to_r_text', 1: 'direction', 2: 'icit_id'}
)
df_all_y = pd.DataFrame(y_all).rename(columns={0: 'site'})

df_all_x_rev = pd.DataFrame(x_all_rev).rename(columns={0: 'reversed_text'})
df_all_y_rev = pd.DataFrame(y_all_rev).rename(columns={0: 'site'})

# Train data, forward and reversed
df_train_x = pd.DataFrame(x_train).rename(
    columns={0: 'l_to_r_text', 1: 'direction', 2: 'icit_id'}
)
df_train_y = pd.DataFrame(y_train).rename(columns={0: 'site'})

df_train_x_rev = pd.DataFrame(x_rev_train).rename(columns={0: 'reversed_text'})
df_train_y_rev = pd.DataFrame(y_rev_train).rename(columns={0: 'site'})

# Test data, forward and reversed
df_test_x = pd.DataFrame(x_test).rename(
    columns={0: 'l_to_r_text', 1: 'direction', 2: 'icit_id'}
)
df_test_y = pd.DataFrame(y_test).rename(columns={0: 'site'})

df_test_x_rev = pd.DataFrame(x_rev_test).rename(columns={0: 'reversed_text'})
df_test_y_rev = pd.DataFrame(y_rev_test).rename(columns={0: 'site'})

In [26]:
"""Indus Core Region  data - Test and Train fwd and reverse"""

"""Core Region data, fwd and reverse"""
# Each array is wrapped in a DataFrame and its positional columns are named in
# a single rename call: forward frames get text / direction / icit_id, reversed
# frames get reversed_text, and label frames get site.

# Core region data, forward and reversed
df_core_x = pd.DataFrame(x_core).rename(
    columns={0: 'l_to_r_text', 1: 'direction', 2: 'icit_id'}
)
df_core_y = pd.DataFrame(y_core).rename(columns={0: 'site'})

df_core_x_rev = pd.DataFrame(x_core_rev).rename(columns={0: 'reversed_text'})
df_core_y_rev = pd.DataFrame(y_core_rev).rename(columns={0: 'site'})

# Core region train data, forward and reversed
df_core_train_x = pd.DataFrame(x_core_train).rename(
    columns={0: 'l_to_r_text', 1: 'direction', 2: 'icit_id'}
)
df_core_train_y = pd.DataFrame(y_core_train).rename(columns={0: 'site'})

df_core_train_x_rev = pd.DataFrame(x_rev_core_train).rename(columns={0: 'reversed_text'})
df_core_train_y_rev = pd.DataFrame(y_rev_core_train).rename(columns={0: 'site'})

# Core region test data, forward and reversed
df_core_test_x = pd.DataFrame(x_core_test).rename(
    columns={0: 'l_to_r_text', 1: 'direction', 2: 'icit_id'}
)
df_core_test_y = pd.DataFrame(y_core_test).rename(columns={0: 'site'})

df_core_test_x_rev = pd.DataFrame(x_rev_core_test).rename(columns={0: 'reversed_text'})
df_core_test_y_rev = pd.DataFrame(y_rev_core_test).rename(columns={0: 'site'})

In [27]:
"""Pickle the data"""
# Write each feature / label frame to its own file under pickle/, in this order.
for frame, target in (
    (df_core_x, 'pickle/core_x.pkl'),
    (df_core_y, 'pickle/core_y.pkl'),
    (df_core_x_rev, 'pickle/core_x_rev.pkl'),
    (df_core_y_rev, 'pickle/core_y_rev.pkl'),
    (df_train_x, 'pickle/train_x.pkl'),
    (df_train_y, 'pickle/train_y.pkl'),
    (df_train_x_rev, 'pickle/train_x_rev.pkl'),
    (df_train_y_rev, 'pickle/train_y_rev.pkl'),
    (df_test_x, 'pickle/test_x.pkl'),
    (df_test_y, 'pickle/test_y.pkl'),
    (df_test_x_rev, 'pickle/test_x_rev.pkl'),
    (df_test_y_rev, 'pickle/test_y_rev.pkl'),
):
    frame.to_pickle(target)

In [28]:
"""Pickle the Indus Core Region data"""

# Write the all-data frames and the core-region train / test frames, each to
# its own file under pickle/, in this order.
for frame, target in (
    (df_all_x, 'pickle/all_x.pkl'),
    (df_all_y, 'pickle/all_y.pkl'),
    (df_all_x_rev, 'pickle/all_x_rev.pkl'),
    (df_all_y_rev, 'pickle/all_y_rev.pkl'),
    (df_core_train_x, 'pickle/core_train_x.pkl'),
    (df_core_train_y, 'pickle/core_train_y.pkl'),
    (df_core_train_x_rev, 'pickle/core_train_x_rev.pkl'),
    (df_core_train_y_rev, 'pickle/core_train_y_rev.pkl'),
    (df_core_test_x, 'pickle/core_test_x.pkl'),
    (df_core_test_y, 'pickle/core_test_y.pkl'),
    (df_core_test_x_rev, 'pickle/core_test_x_rev.pkl'),
    (df_core_test_y_rev, 'pickle/core_test_y_rev.pkl'),
):
    frame.to_pickle(target)

In [29]:
"""Create and Pickle All, test and train Dataframes fwd, rev """

# Join each feature frame with its label frame side by side and preview it.
# Forward-text frames first.
df_all = pd.concat([df_all_x, df_all_y], axis='columns')
print(df_all.head())

df_train = pd.concat([df_train_x, df_train_y], axis='columns')
print(df_train.head())

df_test = pd.concat([df_test_x, df_test_y], axis='columns')
print(df_test.head())

# Reversed-text frames.
df_all_rev = pd.concat([df_all_x_rev, df_all_y_rev], axis='columns')
print(df_all_rev.head())

df_train_rev = pd.concat([df_train_x_rev, df_train_y_rev], axis='columns')
print(df_train_rev.head())

df_test_rev = pd.concat([df_test_x_rev, df_test_y_rev], axis='columns')
print(df_test_rev.head())

"""Pickle all the dataframes we need"""
for combined, target in (
    (df_all, 'pickle/all_df.pkl'),
    (df_train, 'pickle/train_df.pkl'),
    (df_test, 'pickle/test_df.pkl'),
    (df_all_rev, 'pickle/all_rev_df.pkl'),
    (df_train_rev, 'pickle/train_rev_df.pkl'),
    (df_test_rev, 'pickle/test_rev_df.pkl'),
):
    combined.to_pickle(target)

       l_to_r_text direction icit_id        site
0          410 017       L/R       1  Alamgirpur
1          405 017       L/R       3  Alamgirpur
2          235 740       R/L       5   Allahdino
3      590 390 740       R/L       6   Allahdino
4  033 125 390 368       R/L       7   Allahdino
                   l_to_r_text direction icit_id          site
0                          013        NR    2032        Lothal
1                      700 034       R/L     938       Harappa
2                  590 407 740       R/L    1874         Hulas
3      820 002 806 590 405 740       R/L    3611  Mohenjo-daro
4  140 920 484 337 503 456 400       R/L    3578  Mohenjo-daro
                   l_to_r_text direction icit_id          site
0                      003 390       R/L    2191  Mohenjo-daro
1                  235 240 520       R/L    1189       Harappa
2  861 002 003 220 590 405 740       R/L    1175       Harappa
3                          820        NR     912       Harappa
4  140 287 00

In [30]:
"""Create and Pickle Indus Core Dataframes, fwd, rev"""

# Join each core-region feature frame with its label frame side by side and
# preview it. Forward-text frames first.
df_core = pd.concat([df_core_x, df_core_y], axis='columns')
print(df_core.head())

df_core_train = pd.concat([df_core_train_x, df_core_train_y], axis='columns')
print(df_core_train.head())

df_core_test = pd.concat([df_core_test_x, df_core_test_y], axis='columns')
print(df_core_test.head())

# Reversed-text frames.
df_core_rev = pd.concat([df_core_x_rev, df_core_y_rev], axis='columns')
print(df_core_rev.head())

df_core_train_rev = pd.concat([df_core_train_x_rev, df_core_train_y_rev], axis='columns')
print(df_core_train_rev.head())

df_core_test_rev = pd.concat([df_core_test_x_rev, df_core_test_y_rev], axis='columns')
print(df_core_test_rev.head())

"""Pickle all the dataframes we need"""
for combined, target in (
    (df_core, 'pickle/core_df.pkl'),
    (df_core_train, 'pickle/core_train_df.pkl'),
    (df_core_test, 'pickle/core_test_df.pkl'),
    (df_core_rev, 'pickle/core_rev_df.pkl'),
    (df_core_train_rev, 'pickle/core_train_rev_df.pkl'),
    (df_core_test_rev, 'pickle/core_test_rev_df.pkl'),
):
    combined.to_pickle(target)


       l_to_r_text direction icit_id        site
0          410 017       L/R       1  Alamgirpur
1          405 017       L/R       3  Alamgirpur
2          235 740       R/L       5   Allahdino
3      590 390 740       R/L       6   Allahdino
4  033 125 390 368       R/L       7   Allahdino
                   l_to_r_text direction icit_id          site
0  513 460 036 861 002 005 390       R/L    2548  Mohenjo-daro
1                  176 100 740       R/L    1213       Harappa
2                      003 156       R/L    2307  Mohenjo-daro
3                      220 520       R/L     282       Harappa
4                  140 706 064       R/L    1194       Harappa
               l_to_r_text direction icit_id          site
0      798 233 790 900 740       R/L    2648  Mohenjo-daro
1  861 368 001 803 235 520       R/L    3219  Mohenjo-daro
2          244 065 880 820       R/L    2535  Mohenjo-daro
3                  005 390       R/L    2472  Mohenjo-daro
4              235 803 740       