### Parsing the 'PragTic' fatigue database
First of all, let's import the necessary libraries

In [1]:
# Used packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

If one looks at the 'Pragtic' dataset, one may see that it contains complicated tables with columns and rows that contain sub-columns and sub-rows (see, for example, here: https://www.pragtic.com/curve.php?action=view&type=curve&tgr_id=19&cur_id=127). There are some solutions on the internet on how to work with it. Below, there is one of them.

In [2]:
# This code snippet was taken from: https://stackoverflow.com/questions/28763891/what-should-i-do-when-tr-has-rowspan

def pre_process_table(table):
    """
    INPUT:
        1. table - a bs4 element that contains the desired table: ie <table> ... </table>
    OUTPUT:
        a tuple of: 
            1. rows - a list of table rows ie: list of <tr>...</tr> elements
            2. num_rows - number of rows in the table
            3. num_cols - number of columns in the table
    Options:
        include_td_head_count - whether to use only th or th and td to count number of columns (default: False)
    """
    rows = [x for x in table.find_all('tr')]

    num_rows = len(rows)

    # get an initial column count. Most often, this will be accurate
    num_cols = max([len(x.find_all(['th','td'])) for x in rows])

    # sometimes, the tables also contain multi-colspan headers. This accounts for that:
    header_rows_set = [x.find_all(['th', 'td']) for x in rows if len(x.find_all(['th', 'td']))>num_cols/2]

    num_cols_set = []

    for header_rows in header_rows_set:
        num_cols = 0
        for cell in header_rows:
            row_span, col_span = get_spans(cell)
            num_cols+=len([cell.getText()]*col_span)

        num_cols_set.append(num_cols)

    num_cols = max(num_cols_set)

    return (rows, num_rows, num_cols)


def get_spans(cell):
        """
        INPUT:
            1. cell - a <td>...</td> or <th>...</th> element that contains a table cell entry
        OUTPUT:
            1. a tuple with the cell's row and col spans
        """
        if cell.has_attr('rowspan'):
            rep_row = int(cell.attrs['rowspan'])
        else: # ~cell.has_attr('rowspan'):
            rep_row = 1
        if cell.has_attr('colspan'):
            rep_col = int(cell.attrs['colspan'])
        else: # ~cell.has_attr('colspan'):
            rep_col = 1 

        return (rep_row, rep_col)

def process_rows(rows, num_rows, num_cols):
    """
    INPUT:
        1. rows - a list of table rows ie <tr>...</tr> elements
    OUTPUT:
        1. data - a Pandas dataframe with the html data in it
    """
    data = pd.DataFrame(np.ones((num_rows, num_cols))*np.nan)
    for i, row in enumerate(rows):
        try:
            col_stat = data.iloc[i,:][data.iloc[i,:].isnull()].index[0]
        except IndexError:
            print(i, row)

        for j, cell in enumerate(row.find_all(['td', 'th'])):
            rep_row, rep_col = get_spans(cell)

            #print("cols {0} to {1} with rep_col={2}".format(col_stat, col_stat+rep_col, rep_col))
            #print("\trows {0} to {1} with rep_row={2}".format(i, i+rep_row, rep_row))

            #find first non-na col and fill that one
            while any(data.iloc[i,col_stat:col_stat+rep_col].notnull()):
                col_stat+=1

            data.iloc[i:i+rep_row,col_stat:col_stat+rep_col] = cell.getText()
            if col_stat<data.shape[1]-1:
                col_stat+=rep_col

    return data

def main(table):
    rows, num_rows, num_cols = pre_process_table(table)
    df = process_rows(rows, num_rows, num_cols)
    return(df)

Then, we need to go through all the pages and collect the information from them.  

In [3]:
# There are 2 cases: 4 or 3 tables and 2 tables on a webpage of a SUBgroup (BaB01, BaB02,..., Bai01, Bai02, ...). 
# So, let's make 2 DataFrame for both cases
df_test_4 = pd.DataFrame({'pragtic_test_group':[],'material':[],'Research_ref':[],'Pragtic_ref_id':[]}) #
df_test_2 = pd.DataFrame({'pragtic_test_group':[],'material':[],'Research_ref':[],'Pragtic_ref_id':[]}) #

page_0 = requests.get('http://www.pragtic.com/experiments.php') # Getting page HTML
soup_0 = BeautifulSoup(page_0.content, 'html.parser') # Parsing content by BeautifulSoup
links_0 = soup_0.select("tr td select.mode option") # Selecting all the anchors with fatigue tests

for link in links_0[2:59]:   
    href = 'https://www.pragtic.com/experiments.php?action=view&type=group&tgr_id='+link['value'] # refs for each group (BaB,...)
    page = requests.get(href) # Getting page HTML for each group (BaB, Bai, ...)
    soup = BeautifulSoup(page.content, 'html.parser') # Parsing content by BeautifulSoup for each group (BaB, Bai, ...)
    # Reference to research
    r_index = [i+1 for i in range(len(soup.select("tr td "))) if soup.select("tr td ")[i].contents[0] == 'Referenced in:'][0]
    Ref = soup.select("tr td ")[r_index].contents[1]
    pragtic_ref_id = soup.select("tr td ")[r_index].select("b")[0].contents[0].split('\n:')[0] # reference ID
    
    # Selecting all the anchors with material information
    material_link = soup.find_all('a', attrs={'href': lambda e: e.startswith(
        'https://www.pragtic.com/vmat.php?post=') if e else False})
    material_href = material_link[0]['href'] # ref with material information (Ck 35, ...)
    material_page = requests.get(material_href) # Getting page HTML for each material (Ck 35, ...)
    material_soup = BeautifulSoup(material_page.content, 'html.parser') # Parsing content by BeautifulSoup for each material
    material_tables = material_soup.select('table') # All the tables on the page 
    
    # General material information (marks, standard, group)
    rows, num_rows, num_cols = pre_process_table(material_tables[2])
    mater_mark = process_rows(rows, num_rows, num_cols).iloc[0,1] + '/'+ process_rows(rows, num_rows, num_cols).iloc[1,1]
    mater_standard = process_rows(rows, num_rows, num_cols).iloc[2,1]
    mater_group = process_rows(rows, num_rows, num_cols).iloc[3,1]
    temp_tab_mater = pd.DataFrame({'mater_mark_alt':[mater_mark],'mater_standard':[mater_standard],'mater_group':[mater_group]})
    
    # Chemical composition table
    rows, num_rows, num_cols = pre_process_table(material_tables[3])
    balance = process_rows(rows, num_rows, num_cols).iloc[-1,-1] # Information about additional chemical components
    balance_tab = pd.DataFrame({'balance':[balance]})
    # Chemical composition
    rows, num_rows, num_cols = pre_process_table(material_tables[-2])
    df_material_ch_com = process_rows(rows, num_rows, num_cols)
    if len(df_material_ch_com) > 3:
        df_material_ch_com = df_material_ch_com.iloc[:,::-1].iloc[2:-1,-4:].T.iloc[:3]
        df_material_ch_com.columns = df_material_ch_com.iloc[0].str.cat(df_material_ch_com.iloc[1]).copy()
        df_material_ch_com = df_material_ch_com.iloc[[2]].reset_index(drop=True)
        temp_tab_mater = pd.concat([temp_tab_mater,df_material_ch_com,balance_tab], axis=1) # Concatenation of tables
    else: 
        temp_tab_mater = pd.concat([temp_tab_mater,balance_tab], axis=1) # Concatenation of tables
    
    # Tables with treatment information
    rows, num_rows, num_cols = pre_process_table(material_tables[4])
    df_material_tr = process_rows(rows, num_rows, num_cols).T.copy()
    df_material_tr.columns = ['mater_ref', 'mater_comment', 'mater_treatment']
    df_material_tr = df_material_tr.iloc[[1]].reset_index(drop=True)
    temp_tab_mater = pd.concat([temp_tab_mater,df_material_tr], axis=1) # Concatenation of tables
    
    # Mechanical properties tables
    for i in range(len(material_tables[5:-2])):
        rows, num_rows, num_cols = pre_process_table(material_tables[i+5])
        df_mat_prop = process_rows(rows, num_rows, num_cols)
        if len(df_mat_prop) > 3:
            tab_name = df_mat_prop.iloc[0,0]
            df_mat_prop = df_mat_prop.iloc[:,::-1].iloc[2:-1,-4:].T.iloc[:3]
            df_mat_prop.columns = tab_name + ' ' + df_mat_prop.iloc[0].str.cat(df_mat_prop.iloc[1])
            df_mat_prop = df_mat_prop.iloc[[2]].reset_index(drop=True)
            temp_tab_mater = pd.concat([temp_tab_mater,df_mat_prop], axis=1)
    
    # Selecting all the anchors with fatigue SUBgroup (BaB01, BaB02,..., Bai01, Bai02, ...)
    s_links = soup.find_all('a', attrs={'href': lambda e: e.startswith(
                'https://www.pragtic.com/curve.php?action=view&type=curve&tgr_id') if e else False})    
    
    for i in range(len(s_links)):
        s_href = 'https://www.pragtic.com/curve.php?action=view&type=curve&tgr_id=' + link['value'] +\
                    '&cur_id=' + s_links[i]['href'].split('cur_id=')[-1] # refs for each SUBgroup (BaB01,..., Bai01, ...)
        s_page = requests.get(s_href) # Getting page HTML for each SUBgroup (BaB01, BaB02,..., Bai01, Bai02, ...)
        s_soup = BeautifulSoup(s_page.content, 'html.parser') # Parsing content by BeautifulSoup for each SUBgroup
        s_tables = s_soup.select('table') # All the tables for each SUBgroup (BaB01, BaB02,..., Bai01, Bai02, ...)
        
        # Creating the complex table with specimen information
        rows_2, num_rows_2, num_cols_2 = pre_process_table(s_tables[-1])
        test_tab_2 = process_rows(rows_2, num_rows_2, num_cols_2)
        spec_type = test_tab_2.loc[0,0] # Specimen type
        spec_comment = test_tab_2.iloc[-1,1] # Commentary about specimens
        # Table with information about specimen geometry
        temp_tab_2 = pd.DataFrame({'Specimen_type':[],'Specimen_comment':[]})
        test_tab_21 = test_tab_2.iloc[1:-1,:2].set_index(0).T.reset_index(drop=True)
        temp_tab_2 = pd.concat([temp_tab_2,test_tab_21,temp_tab_mater], axis=1)
        temp_tab_2.fillna({'Specimen_type':spec_type,'Specimen_comment':spec_comment}, inplace=True)
        
        if len(s_tables[1:]) == 4 or len(s_tables[1:]) == 3:
            # Creating the complex table
            rows, num_rows, num_cols = pre_process_table(s_tables[2])
            test_tab_3_4 = process_rows(rows, num_rows, num_cols)
            # Cleaning the table
            test_tab_3_4.columns = test_tab_3_4.loc[0].str.cat(test_tab_3_4.loc[1]).copy() # Renaming columns
            test_tab_3_4.drop([0, 1], inplace=True) # Dropping the first 2 rows
            test_tab_3_4[temp_tab_2.columns] = temp_tab_2.iloc[0]
            df_test_4 = pd.concat([df_test_4,test_tab_3_4])
            
        elif len(s_tables[1:]) == 2: 
            # Creating the complex table
            rows, num_rows, num_cols = pre_process_table(s_tables[1])
            test_tab = process_rows(rows, num_rows, num_cols)
            # Number of cycles at fatigue limit
            try: fat_life = float(test_tab[test_tab[0] == 'Number of cycles at fat. lim.:'].values[0, 1])
            except: fat_life = 1 # if there is no fatigue life
            test_tab1 = test_tab[test_tab[0] == 'Load channels:'] # It contains only information about loading
            try: fat_lim = float(test_tab[test_tab[0] == 'Fatigue limit:'].values[0, 1]) # Fatigue limit
            except: fat_lim = 0 # if there is no fatigue limit
            Mark = test_tab[test_tab[0] == 'Curve mark:'].values[0, 1] # Name of SUBgroup (Ber01,..., TAK01, ...)
            temp_tab = pd.DataFrame({'Mark':[],'N':[], 'Completed':[]}) 
            temp_tab = pd.concat([temp_tab_2,temp_tab], axis=1).reset_index(drop=True)
            for i in range(2,len(test_tab1.index),3):
                col1 = test_tab1.iloc[i-1][test_tab1.iloc[i-1] == '\xa0'] # Columns without relevant information
                test_tab11 = test_tab1.iloc[i-2:i+1].drop(columns=col1.index) # Dropping those columns
                test_tab11.columns = test_tab11.iloc[0]\
                        .str.cat(test_tab11.iloc[1]).copy() # Renaming columns by combining info from the 1st two rows
                test_tab11.drop(test_tab11.index[:2], inplace=True) # Dropping the 1st two rows
                # Multiplyer of the fatigue limit
                try: mult = float(test_tab11.iloc[0,:][test_tab11.iloc[0,:].str.contains('x')].values[0].split('x')[1])
                except: mult = 0
                # Multiplication of the fatigue limit and multiplier
                test_tab11.iloc[0,:][test_tab11.iloc[0,:].str.contains('x')] = mult*fat_lim
                test_tab11.drop(columns=test_tab11.columns[:2], inplace=True) # Dropping columns without relevant information
                test_tab11.reset_index(drop=True, inplace=True)
                temp_tab = pd.concat([temp_tab,test_tab11], axis=1) # Concatenation of tables
                # Filling NaN values in columns
                temp_tab.fillna({'N':fat_life,'Completed':'Fatigue limit','Mark':Mark}, inplace=True)
            df_test_2 = pd.concat([df_test_2,temp_tab])
    
    # Filling NaN values in columns
    df_test_4.fillna({'pragtic_test_group':link.contents[0],'material':material_link[0].contents[0],
                      'Research_ref':Ref,'Pragtic_ref_id':pragtic_ref_id}, inplace=True)
    df_test_2.fillna({'pragtic_test_group':link.contents[0],'material':material_link[0].contents[0],
                      'Research_ref':Ref,'Pragtic_ref_id':pragtic_ref_id}, inplace=True)

The column names look a bit messy, so let's correct them

In [6]:
# Renaming columns
df_test_4.rename(columns={"MarkMark": "Mark", "NN": "N", 'CompletedCompleted':'Completed',
                         'Static data Ultimate tensile strengthMPa': 'Ultim_tens_str_MPa',
                         'Static data Ultimate shear strengthMPa': 'Ultim_shear_str_MPa',
                         'Static data Tensile yield stressMPa': 'Tens_yield_str_MPa',
                         'Static data Shear yield stressMPa': 'Shear_yield_str_MPa',
                         'Fully reversed push-pull Diameter of specimen at active cross-sectionmm':'Ful_rev_T_C_Spec_Diam_mm',
                         'Fully reversed push-pull Fatigue limitMPa': 'Ful_rev_T_C_Fat_lim_MPa',
                         'Fully reversed push-pull Exponent of S-N curve-': 'Ful_rev_T_C_Exp_SN_curv',
                         'Fully reversed push-pull Number of cycles at fatigue limit-': 'Ful_rev_T_C_N_for_fat_lim',
                         'Repeated tension Fatigue limitMPa': 'Repeat_T_Fat_lim_MPa',
                         'Repeated tension Exponent of S-N curve-': 'Repeat_T_Exp_SN_curv',
                         'Repeated tension Number of cycles at fatigue limit-': 'Repeat_T_N_for_fat_lim',
                         'Fully reversed torsion Diameter of specimen at active cross-sectionmm': 'Ful_rev_Tor_Spec_Diam_mm',
                         'Fully reversed torsion Fatigue limitMPa': 'Ful_rev_Tor_Fat_lim_MPa',
                         'Fully reversed torsion Exponent of S-N curve-': 'Ful_rev_Tor_Exp_SN_curv',
                         'Fully reversed torsion Number of cycles at fatigue limit-': 'Ful_rev_Tor_N_for_fat_lim',
                         'Repeated torsion Fatigue limitMPa': 'Repeat_Tor_Fat_lim_MPa',
                         'Repeated torsion Exponent of S-N curve-': 'Repeat_Tor_Exp_SN_curv',
                         'Repeated torsion Number of cycles at fatigue limit-': 'Repeat_Tor_N_for_fat_lim',
                         'Static data Reduction of area at fracture%': 'Reduction_area_frac_%',
                         'Static data Ambient temperaturedegC': 'Stat_temper_C',
                         'Static data Tensile elasticity modulusMPa': 'Tens_elast_modul_MPa',
                         'Static data Elongation at fracture%': 'Stat_Elong_frac_%',
                         'To - harmonic, constant amplitudeA': 't_a_MPa',
                         'To - harmonic, constant amplitudeF': 'Freq_t_Hz',
                         'To - harmonic, constant amplitudePS': 'PS_t_deg',
                         'TP - harmonic, constant amplitudeA': 'tan_a(pres)_MPa)',
                         'TP - harmonic, constant amplitudeM': 'tan_m(pres)_MPa)',
                         'TP - harmonic, constant amplitudeF': 'Freq_tan_Hz',
                         'TP - harmonic, constant amplitudePS': 'PS_tan_deg',
                         'Carbon content%': 'Carbon_%', 'Silicon content%': 'Silicon_%', 
                         'Manganese content%': 'Manganese_%', 'Phosphorus content%': 'Phosphorus_%', 
                         'Sulphur content%': 'Sulphur_%',}, inplace=True)

df_test_2.rename(columns={'Static data Ultimate tensile strengthMPa': 'Ultim_tens_str_MPa',
                         'Static data Ultimate shear strengthMPa': 'Ultim_shear_str_MPa',
                         'Static data Tensile yield stressMPa': 'Tens_yield_str_MPa',
                         'Static data Shear yield stressMPa': 'Shear_yield_str_MPa',
                         'Fully reversed push-pull Diameter of specimen at active cross-sectionmm':'Ful_rev_T_C_Spec_Diam_mm',
                         'Fully reversed push-pull Fatigue limitMPa': 'Ful_rev_T_C_Fat_lim_MPa',
                         'Fully reversed push-pull Exponent of S-N curve-': 'Ful_rev_T_C_Exp_SN_curv',
                         'Fully reversed push-pull Number of cycles at fatigue limit-': 'Ful_rev_T_C_N_for_fat_lim',
                         'Repeated tension Fatigue limitMPa': 'Repeat_T_Fat_lim_MPa',
                         'Repeated tension Exponent of S-N curve-': 'Repeat_T_Exp_SN_curv',
                         'Repeated tension Number of cycles at fatigue limit-': 'Repeat_T_N_for_fat_lim',
                         'Fully reversed torsion Diameter of specimen at active cross-sectionmm': 'Ful_rev_Tor_Spec_Diam_mm',
                         'Fully reversed torsion Fatigue limitMPa': 'Ful_rev_Tor_Fat_lim_MPa',
                         'Fully reversed torsion Exponent of S-N curve-': 'Ful_rev_Tor_Exp_SN_curv',
                         'Fully reversed torsion Number of cycles at fatigue limit-': 'Ful_rev_Tor_N_for_fat_lim',
                         'Repeated torsion Fatigue limitMPa': 'Repeat_Tor_Fat_lim_MPa',
                         'Repeated torsion Exponent of S-N curve-': 'Repeat_Tor_Exp_SN_curv',
                         'Repeated torsion Number of cycles at fatigue limit-': 'Repeat_Tor_N_for_fat_lim',
                         'Static data Reduction of area at fracture%': 'Reduction_area_frac_%',
                         'Static data Ambient temperaturedegC': 'Stat_temper_C',
                         'Static data Tensile elasticity modulusMPa': 'Tens_elast_modul_MPa',
                         'Static data Elongation at fracture%': 'Stat_Elong_frac_%',
                         'Static data Hardness (acc. to Rockwell)-': 'Stat_Hardness_Rockwell',
                         'Static data Hardness (acc. to Brinell)-': 'Stat_Hardness_Brinell', 
                         'Static data Hardness (acc. to Vickers)-': 'Stat_Hardness_Vickers',
                         'Static data Coefficient of static strengthMPa': 'Coef_stat_str_MPa',
                         'Static data Exponent of static strength-': 'Exp_stat_str',
                         'Fully reversed push-pull Load frequencyHz': 'Ful_rev_T_C_freq_Hz',
                         "Static data Poisson's ratio-": 'Pois_rat',
                         'Fully reversed bending Fatigue limitMPa': 'Ful_rev_bend_fat_lim_MPa',
                         'Rotating bending Fatigue limitMPa': 'Rotat_bend_fat_lim_MPa',
                         'Fully reversed bending Number of cycles at fatigue limit-': 'Ful_rev_bend_N_for_fat_lim',
                         'Repeated bending Fatigue limitMPa': 'Repeat_bend_fat_lim_MPa',
                         'Fully reversed bending Diameter of specimen at active cross-sectionmm': 'Ful_rev_bend_Spec_Diam_mm',
                         'Repeated bending Number of cycles at fatigue limit-': 'Repeat_bend_N_for_fat_lim',
                         'Fully reversed bending Exponent of S-N curve-': 'Ful_rev_bend_Exp_SN_curv',
                         'Static data Shear elasticity modulusMPa': 'Shear_elast_modul_MPa',
                         'Carbon content%': 'Carbon_%', 'Silicon content%': 'Silicon_%', 
                         'Manganese content%': 'Manganese_%', 'Phosphorus content%': 'Phosphorus_%', 
                         'Sulphur content%': 'Sulphur_%', 'Niobium content%':'Niobium_%',
                         'Chromium content%':'Chromium_%', 'Molybdenum content%':'Molybdenum_%', 
                         'Nickel content%':'Nickel_%', 'Vanadium content%':'Vanadium_%', 
                         'Wolfram content%':'Wolfram_%', 'Titanium content%':'Titanium_%',
                         'Copper content%':'Copper_%', 'Aluminium content%':'Aluminium_%'}, inplace=True)

# Dropping columns
df_test_4.drop(columns={'CommentaryCommentary', 'mater_group'}, inplace=True)
df_test_2.drop(columns={'mater_group'}, inplace=True)

# Dropping indexes
df_test_4.reset_index(drop=True, inplace=True)
df_test_2.reset_index(drop=True, inplace=True)

Bending, rotating bending, and push-pull tests provide slightly different results. It is due to, for instance, stress gradient on the surface in case of bending. One needs to take it into account. Therefore, let's create the mark if it is a bending test or rotating bending. After that, let's combine the results for axial, shear, and tangential stresses and then drop irrelevant columns.

In [7]:
# Creating the mark if it is a bending test
df_test_4['Bending'] = 0
bend_ind_4 = df_test_4[['PB - harmonic, constant amplitudeA', 'PB - harmonic, constant amplitudeM', 
                        'PB - harmonic, constant amplitudeF', 'PB - harmonic, constant amplitudePS']].dropna(how='all').index
df_test_4.loc[bend_ind_4,'Bending'] = 1

df_test_2['Bending'] = 0
bend_ind_2 = df_test_2[['plane bending: constant loadM', 'plane bending: harmonic, constant amplitude\xa0\xa0Master - AA',
                       'plane bending: harmonic, constant amplitude\xa0\xa0Master - AM',
                       'plane bending: harmonic, constant amplitude\xa0\xa0Master - AF',
                       'plane bending: harmonic, constant amplitude\xa0\xa0Master - APS']].dropna(how='all').index
df_test_2.loc[bend_ind_2,'Bending'] = 1

df_test_2['Rot_Bending'] = 0
rot_bend_ind_2 = df_test_2[['rotating bending: harmonic, constant amplitude\xa0\xa0Master - AA',
                           'rotating bending: harmonic, constant amplitude\xa0\xa0Master - AM',
                           'rotating bending: harmonic, constant amplitude\xa0\xa0Master - AF',
                           'rotating bending: harmonic, constant amplitude\xa0\xa0Master - APS']].dropna(how='all').index
df_test_2.loc[rot_bend_ind_2,'Rot_Bending'] = 1

In [8]:
# Combining axial stress data
df_test_2['S_m_MPa'] = df_test_2['tension-compression: constant loadM'].fillna(0).astype(float) \
                + df_test_2['tension-compression: harmonic, constant amplitude\xa0\xa0Master - AM'].fillna(0).astype(float) \
                + df_test_2['no load: harmonic, constant amplitude\xa0\xa0Master - AM'].fillna(0).astype(float) \
                + df_test_2['plane bending: constant loadM'].fillna(0).astype(float) \
                + df_test_2['plane bending: harmonic, constant amplitude\xa0\xa0Master - AM'].fillna(0).astype(float) \
                + df_test_2['rotating bending: harmonic, constant amplitude\xa0\xa0Master - AM'].fillna(0).astype(float)

df_test_2['S_a_MPa'] = df_test_2['no load: harmonic, constant amplitude\xa0\xa0Master - AA'].fillna(0).astype(float) \
                + df_test_2['tension-compression: harmonic, constant amplitude\xa0\xa0Master - AA'].fillna(0).astype(float) \
                + df_test_2['plane bending: harmonic, constant amplitude\xa0\xa0Master - AA'].fillna(0).astype(float) \
                + df_test_2['rotating bending: harmonic, constant amplitude\xa0\xa0Master - AA'].fillna(0).astype(float) 

df_test_2['Freq_S_Hz'] = df_test_2['no load: harmonic, constant amplitude\xa0\xa0Master - AF'].fillna(0).astype(float) \
                + df_test_2['tension-compression: harmonic, constant amplitude\xa0\xa0Master - AF'].fillna(0).astype(float) \
                + df_test_2['plane bending: harmonic, constant amplitude\xa0\xa0Master - AF'].fillna(0).astype(float) \
                + df_test_2['rotating bending: harmonic, constant amplitude\xa0\xa0Master - AF'].fillna(0).astype(float) 

df_test_2['PS_S_deg'] = df_test_2['no load: harmonic, constant amplitude\xa0\xa0Master - APS'].fillna(0).astype(float) \
                + df_test_2['tension-compression: harmonic, constant amplitude\xa0\xa0Master - APS'].fillna(0).astype(float) \
                + df_test_2['plane bending: harmonic, constant amplitude\xa0\xa0Master - APS'].fillna(0).astype(float) \
                + df_test_2['rotating bending: harmonic, constant amplitude\xa0\xa0Master - APS'].fillna(0).astype(float)

# Combining shear stress data
df_test_2['t_m_MPa'] = df_test_2['torsion: harmonic, constant amplitude\xa0\xa0Master - AM'].fillna(0).astype(float) \
                + df_test_2['torsion: harmonic, constant amplitudeM'].fillna(0).astype(float) \
                + df_test_2['torsion: constant loadM'].fillna(0).astype(float)

df_test_2['t_a_MPa'] = df_test_2['torsion: harmonic, constant amplitude\xa0\xa0Master - AA'].fillna(0).astype(float) \
                + df_test_2['torsion: harmonic, constant amplitudeA'].fillna(0).astype(float)

df_test_2['Freq_t_Hz'] = df_test_2['torsion: harmonic, constant amplitude\xa0\xa0Master - AF'].fillna(0).astype(float) \
                + df_test_2['torsion: harmonic, constant amplitudeF'].fillna(0).astype(float)

df_test_2['PS_t_deg'] = df_test_2['torsion: harmonic, constant amplitude\xa0\xa0Master - APS'].fillna(0).astype(float) \
                + df_test_2['torsion: harmonic, constant amplitudePS'].fillna(0).astype(float)

# Combining tangential stress data
df_test_2['tan_m(pres)_MPa)'] = df_test_2['tangential stress (pressurizing): constant loadM'].fillna(0).astype(float) \
    + df_test_2['tangential stress (pressurizing): harmonic, constant amplitude\xa0\xa0Master - AM'].fillna(0).astype(float) \
    + df_test_2['tangential stress (pressurizing): harmonic, constant amplitudeM'].fillna(0).astype(float)

df_test_2['tan_a(pres)_MPa)'] = df_test_2['tangential stress (pressurizing): harmonic, constant amplitudeA'].fillna(0).astype(float) \
    + df_test_2['tangential stress (pressurizing): harmonic, constant amplitude\xa0\xa0Master - AA'].fillna(0).astype(float)

df_test_2['Freq_tan_Hz'] = df_test_2['tangential stress (pressurizing): harmonic, constant amplitudeF'].fillna(0).astype(float)\
    + df_test_2['tangential stress (pressurizing): harmonic, constant amplitude\xa0\xa0Master - AF'].fillna(0).astype(float)

df_test_2['PS_tan_deg'] = df_test_2['tangential stress (pressurizing): harmonic, constant amplitudePS'].fillna(0).astype(float)\
    + df_test_2['tangential stress (pressurizing): harmonic, constant amplitude\xa0\xa0Master - APS'].fillna(0).astype(float)

In [10]:
# Dropping columns
df_test_2.drop(columns={'tension-compression: constant loadM',
                        'tension-compression: harmonic, constant amplitude\xa0\xa0Master - AM',
                        'no load: harmonic, constant amplitude\xa0\xa0Master - AM',
                        'plane bending: constant loadM',
                        'plane bending: harmonic, constant amplitude\xa0\xa0Master - AM',
                        'rotating bending: harmonic, constant amplitude\xa0\xa0Master - AM',
                        'no load: harmonic, constant amplitude\xa0\xa0Master - AA',
                        'tension-compression: harmonic, constant amplitude\xa0\xa0Master - AA',
                        'plane bending: harmonic, constant amplitude\xa0\xa0Master - AA',
                        'rotating bending: harmonic, constant amplitude\xa0\xa0Master - AA',
                        'no load: harmonic, constant amplitude\xa0\xa0Master - AF',
                        'tension-compression: harmonic, constant amplitude\xa0\xa0Master - AF',
                        'plane bending: harmonic, constant amplitude\xa0\xa0Master - AF',
                        'rotating bending: harmonic, constant amplitude\xa0\xa0Master - AF',
                        'no load: harmonic, constant amplitude\xa0\xa0Master - APS',
                        'tension-compression: harmonic, constant amplitude\xa0\xa0Master - APS',
                        'plane bending: harmonic, constant amplitude\xa0\xa0Master - APS',
                        'rotating bending: harmonic, constant amplitude\xa0\xa0Master - APS',
                        'torsion: harmonic, constant amplitude\xa0\xa0Master - AM',
                        'torsion: harmonic, constant amplitudeM',
                        'torsion: constant loadM',
                        'torsion: harmonic, constant amplitude\xa0\xa0Master - AA',
                        'torsion: harmonic, constant amplitudeA',
                        'torsion: harmonic, constant amplitude\xa0\xa0Master - AF',
                        'torsion: harmonic, constant amplitudeF',
                        'torsion: harmonic, constant amplitude\xa0\xa0Master - APS',
                        'torsion: harmonic, constant amplitudePS',
                        'tangential stress (pressurizing): constant loadM',
                        'tangential stress (pressurizing): harmonic, constant amplitude\xa0\xa0Master - AM',
                        'tangential stress (pressurizing): harmonic, constant amplitudeM',
                        'tangential stress (pressurizing): harmonic, constant amplitudeA',
                        'tangential stress (pressurizing): harmonic, constant amplitude\xa0\xa0Master - AA',
                        'tangential stress (pressurizing): harmonic, constant amplitudeF',
                        'tangential stress (pressurizing): harmonic, constant amplitude\xa0\xa0Master - AF',
                        'tangential stress (pressurizing): harmonic, constant amplitudePS',
                        'tangential stress (pressurizing): harmonic, constant amplitude\xa0\xa0Master - APS'}, inplace=True)

In [11]:
# Combining axial stress data
df_test_4['S_m_MPa'] = df_test_4['Ten - constant loadM'].fillna(0).astype(float) \
                    + df_test_4['Ten - harmonic, constant amplitudeM'].fillna(0).astype(float)\
                    + df_test_4['PB - harmonic, constant amplitudeM'].fillna(0).astype(float)

df_test_4['S_a_MPa'] = df_test_4['Ten - harmonic, constant amplitudeA'].fillna(0).astype(float)\
                    + df_test_4['PB - harmonic, constant amplitudeA'].fillna(0).astype(float)

df_test_4['Freq_S_Hz'] = df_test_4['Ten - harmonic, constant amplitudeF'].fillna(0).astype(float)\
                    + df_test_4['PB - harmonic, constant amplitudeF'].fillna(0).astype(float)

df_test_4['PS_S_deg'] = df_test_4['Ten - harmonic, constant amplitudePS'].fillna(0).astype(float)\
                    + df_test_4['PB - harmonic, constant amplitudePS'].fillna(0).astype(float)

# Combining shear stress data
df_test_4['t_m_MPa'] = df_test_4['To - constant loadM'].fillna(0).astype(float) \
                    + df_test_4['To - harmonic, constant amplitudeM'].fillna(0).astype(float)

In [12]:
# Dropping columns
df_test_4.drop(columns={'Ten - constant loadM', 'Ten - harmonic, constant amplitudeM', 'PB - harmonic, constant amplitudeM', 
                        'PB - harmonic, constant amplitudeA', 'Ten - harmonic, constant amplitudeA',
                        'Ten - harmonic, constant amplitudeF', 'PB - harmonic, constant amplitudeF',
                        'Ten - harmonic, constant amplitudePS', 'PB - harmonic, constant amplitudePS',
                        'To - constant loadM', 'To - harmonic, constant amplitudeM'}, inplace=True)

The part surface area may affect the fatigue resistance, therefore the next step is to calculate values of the inner and outer surface area as well as the cross-section area of used specimens.

In [13]:
# Creating new columns with zeros values
df_test_4['CS_Area'] = 0
df_test_4['Out_Cov_Area'] = 0
df_test_4['Inn_Cov_Area'] = 0

# Filling these new columns with calculated values
for i in df_test_4.index:
    # A solid bar of circular cross-section
    if df_test_4['Specimen_type'].loc[i] == 'Solid bar of circular cross-section, unnotched':
        # The cross-section area of the specimen
        df_test_4['CS_Area'].loc[i] = np.pi/4 * (float(df_test_4['v1 - Diameter [mm]:'].loc[i]))**2
        # The outer surface area of the specimen's work part
        df_test_4['Out_Cov_Area'].loc[i] = np.pi * float(df_test_4['v1 - Diameter [mm]:'].loc[i]) * \
            float(df_test_4['v2 - Active length [mm]:'].loc[i])
        # The inner surface area of the specimen's work part
        df_test_4['Inn_Cov_Area'].loc[i] = 0
    # Flat solid specimen (GBA)
    if (df_test_4['Specimen_type'].loc[i] == 'Flat solid specimen, unnotched' and 
                df_test_4['pragtic_test_group'].loc[i] == 'GBA'):
        # The cross-section area of the specimen
        df_test_4['CS_Area'].loc[i] = float(df_test_4['v1 - Width [mm]:'].loc[i]) * \
            float(df_test_4['v6 - Thickness [mm]:'].loc[i])
        # The outer surface area of the specimen's work part
        df_test_4['Out_Cov_Area'].loc[i] = (float(df_test_4['v1 - Width [mm]:'].loc[i]) + \
            float(df_test_4['v6 - Thickness [mm]:'].loc[i])) * 2 * \
                float(df_test_4['v2 - Active length [mm]:'].loc[i])
        # The inner surface area of the specimen's work part
        df_test_4['Inn_Cov_Area'].loc[i] = 0
    # Flat solid specimen (ST_S)
    if (df_test_4['Specimen_type'].loc[i] == 'Flat solid specimen, unnotched' and 
                df_test_4['pragtic_test_group'].loc[i] == 'ST_S'):
        # The cross-section area of the specimen
        df_test_4['CS_Area'].loc[i] = float(df_test_4['v1 - Width [mm]:'].loc[i]) * \
            float(df_test_4['v3 - Fillet radius [mm]:'].loc[i])
        # The outer surface area of the specimen's work part
        df_test_4['Out_Cov_Area'].loc[i] = (float(df_test_4['v1 - Width [mm]:'].loc[i]) + \
            float(df_test_4['v3 - Fillet radius [mm]:'].loc[i])) * 2 * \
                float(df_test_4['v4 - Total length [mm]:'].loc[i])
        # The inner surface area of the specimen's work part
        df_test_4['Inn_Cov_Area'].loc[i] = 0
    # Hollow bar of circular cross-section
    if df_test_4['Specimen_type'].loc[i] == 'Hollow bar of circular cross-section, unnotched':
        # The cross-section area of the specimen
        df_test_4['CS_Area'].loc[i] = np.pi/4 * ((float(df_test_4['v1 - Outer diameter [mm]:'].loc[i]))**2 - \
                                                (float(df_test_4['v2 - Inner diameter [mm]:'].loc[i]))**2)
        # The outer surface area of the specimen's work part
        df_test_4['Out_Cov_Area'].loc[i] = np.pi * float(df_test_4['v1 - Outer diameter [mm]:'].loc[i]) * \
            float(df_test_4['v3 - Active length [mm]:'].loc[i])
        # The inner surface area of the specimen's work part
        df_test_4['Inn_Cov_Area'].loc[i] = np.pi * float(df_test_4['v2 - Inner diameter [mm]:'].loc[i]) * \
            float(df_test_4['v3 - Active length [mm]:'].loc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [14]:
# If there is no information about specimen geometry, let's drop the columns
df_test_2.drop(columns={'v1 - Diameter [mm]:','v2 - Active length [mm]:','v3 - Fillet radius [mm]:',
                        'v5 - Total length [mm]:', 'v4 - Diameter at fixture [mm]:'}, inplace=True)

The last step is to save the resulting DataFrame to a 'CSV' file.

In [1277]:
df_fin = pd.concat([df_4_fin, df_2_fin]).reset_index(drop=True)
df_fin.to_csv('fat_dataset.csv')